gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors will, for now, need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
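/* For illustration, the support check described above boils down to a
   query like the following (a minimal sketch, not the exact code used
   in this file):

     machine_mode vmode = V8HImode;
     bool supported = optab_handler (add_optab, vmode) != CODE_FOR_nothing;

   If SUPPORTED is false the target has no V8HI addition pattern and the
   statement cannot be vectorized with this vector mode.  */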
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
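/* As a worked example (a sketch assuming a 16-byte vector size): with
   4-byte ints the VF is 16 / 4 = 4, so

     for (i=0; i<N; i++)
       a[i] = b[i] + c[i];

   is strip-mined into

     for (i=0; i<N; i+=4)
       a[i:4] = b[i:4] + c[i:4];

   with any leftover iterations (N % 4) handled by a scalar epilogue
   loop or by partial vectors.  */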
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
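/* For instance (an illustrative sketch), for the pointer IV in

     for (i = 0; i < n; i++)
       *p++ = 0;

   scev describes p as the polynomial chrec {p_0, +, 4}_1 (assuming
   4-byte elements), so *INIT is p_0 and *STEP is 4.  The actual trees
   come from analyze_scalar_evolution.  */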
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Nonlinear induction is only supported for integer types:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
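/* Illustrative source forms of the supported cases:

     for (i = 0; i != n; i++) x = -x;      neg, fake step -1
     for (i = 0; i != n; i++) x *= 3;      mul by constant, step 3
     for (i = 0; i != n; i++) x >>= 1;     rshift by constant, step 1  */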
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
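/* At the source level a double reduction typically comes from a nest
   like (an illustrative sketch):

     for (j = 0; j < m; j++)
       for (i = 0; i < n; i++)
         sum += a[j][i];

   where the inner-loop PHI for sum plays the role of x_2 above.  */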
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
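/* An illustrative source form (a sketch) is

     t = init;
     for (i = 0; i < n; i++)
       {
         b[i] = a[i] + t;
         t = a[i];
       }

   where the loop PHI for t carries the previous iteration's a[i] into
   the current one.  */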
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as a first-order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be some subsequent
585 SLP analyses. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner-loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmts in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
825 it as regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
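/* As a concrete example (a sketch): for a counted loop of the form

     for (i = 0; i < n; i++)
       ...

   that is known to execute at least once, NUMBER_OF_ITERATIONSM1 is the
   latch execution count n - 1 and NUMBER_OF_ITERATIONS is n; ASSUMPTIONS
   records any condition (e.g. on n) under which this niter analysis is
   valid.  */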
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions; this can simplify
911 the computation of the niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based on the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
945 unshare_expr (niter),
946 build_int_cst (TREE_TYPE (niter), 1));
947 *number_of_iterations = niter;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
953 return conds;
956 /* Determine the main loop exit for the vectorizer. */
958 edge
959 vec_init_loop_exit_info (class loop *loop)
961 /* Before we begin we must first determine which exit is the main one and
962 which are auxiliary exits. */
963 auto_vec<edge> exits = get_loop_exit_edges (loop);
964 if (exits.length () == 1)
965 return exits[0];
967 /* If we have multiple exits we only support a counting IV at the moment.
968 Analyze all exits and return one. */
969 class tree_niter_desc niter_desc;
970 edge candidate = NULL;
971 for (edge exit : exits)
973 if (!get_loop_exit_condition (exit))
974 continue;
976 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
977 && !chrec_contains_undetermined (niter_desc.niter))
979 if (!niter_desc.may_be_zero || !candidate)
980 candidate = exit;
984 return candidate;
987 /* Function bb_in_loop_p
989 Used as predicate for dfs order traversal of the loop bbs. */
991 static bool
992 bb_in_loop_p (const_basic_block bb, const void *data)
994 const class loop *const loop = (const class loop *)data;
995 if (flow_bb_inside_loop_p (loop, bb))
996 return true;
997 return false;
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002 stmt_vec_info structs for all the stmts in LOOP_IN. */
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005 : vec_info (vec_info::loop, shared),
1006 loop (loop_in),
1007 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008 num_itersm1 (NULL_TREE),
1009 num_iters (NULL_TREE),
1010 num_iters_unchanged (NULL_TREE),
1011 num_iters_assumptions (NULL_TREE),
1012 vector_costs (nullptr),
1013 scalar_costs (nullptr),
1014 th (0),
1015 versioning_threshold (0),
1016 vectorization_factor (0),
1017 main_loop_edge (nullptr),
1018 skip_main_loop_edge (nullptr),
1019 skip_this_loop_edge (nullptr),
1020 reusable_accumulators (),
1021 suggested_unroll_factor (1),
1022 max_vectorization_factor (0),
1023 mask_skip_niters (NULL_TREE),
1024 rgroup_compare_type (NULL_TREE),
1025 simd_if_cond (NULL_TREE),
1026 partial_vector_style (vect_partial_vectors_none),
1027 unaligned_dr (NULL),
1028 peeling_for_alignment (0),
1029 ptr_mask (0),
1030 ivexpr_map (NULL),
1031 scan_map (NULL),
1032 slp_unrolling_factor (1),
1033 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034 vectorizable (false),
1035 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036 using_partial_vectors_p (false),
1037 using_decrementing_iv_p (false),
1038 using_select_vl_p (false),
1039 epil_using_partial_vectors_p (false),
1040 partial_load_store_bias (0),
1041 peeling_for_gaps (false),
1042 peeling_for_niter (false),
1043 no_data_dependencies (false),
1044 has_mask_store (false),
1045 scalar_loop_scaling (profile_probability::uninitialized ()),
1046 scalar_loop (NULL),
1047 orig_loop_info (NULL),
1048 vec_loop_iv_exit (NULL),
1049 vec_epilogue_loop_iv_exit (NULL),
1050 scalar_loop_iv_exit (NULL)
1052 /* CHECKME: We want to visit all BBs before their successors (except for
1053 latch blocks, for which this assertion wouldn't hold). In the simple
1054 case of the loop forms we allow, a dfs order of the BBs would be the same
1055 as a reversed postorder traversal, so we are safe. */
1057 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1058 bbs, loop->num_nodes, loop);
1059 gcc_assert (nbbs == loop->num_nodes);
1061 for (unsigned int i = 0; i < nbbs; i++)
1063 basic_block bb = bbs[i];
1064 gimple_stmt_iterator si;
1066 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1068 gimple *phi = gsi_stmt (si);
1069 gimple_set_uid (phi, 0);
1070 add_stmt (phi);
1073 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1075 gimple *stmt = gsi_stmt (si);
1076 gimple_set_uid (stmt, 0);
1077 if (is_gimple_debug (stmt))
1078 continue;
1079 add_stmt (stmt);
1080 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1081 third argument is the #pragma omp simd if (x) condition: when it is 0,
1082 the loop shouldn't be vectorized; when it is a non-zero constant, it should
1083 be vectorized normally; otherwise the loop is versioned, with the vectorized
1084 loop used if the condition is non-zero at runtime. */
1085 if (loop_in->simduid
1086 && is_gimple_call (stmt)
1087 && gimple_call_internal_p (stmt)
1088 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1089 && gimple_call_num_args (stmt) >= 3
1090 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1091 && (loop_in->simduid
1092 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1094 tree arg = gimple_call_arg (stmt, 2);
1095 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1096 simd_if_cond = arg;
1097 else
1098 gcc_assert (integer_nonzerop (arg));
1103 epilogue_vinfos.create (6);
1106 /* Free all levels of rgroup CONTROLS. */
1108 void
1109 release_vec_loop_controls (vec<rgroup_controls> *controls)
1111 rgroup_controls *rgc;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (*controls, i, rgc)
1114 rgc->controls.release ();
1115 controls->release ();
1118 /* Free all memory used by the _loop_vec_info, as well as all the
1119 stmt_vec_info structs of all the stmts in the loop. */
1121 _loop_vec_info::~_loop_vec_info ()
1123 free (bbs);
1125 release_vec_loop_controls (&masks.rgc_vec);
1126 release_vec_loop_controls (&lens);
1127 delete ivexpr_map;
1128 delete scan_map;
1129 epilogue_vinfos.release ();
1130 delete scalar_costs;
1131 delete vector_costs;
1133 /* When we release an epilogue vinfo that we do not intend to use,
1134 avoid clearing AUX of the main loop, which should continue to
1135 point to the main loop vinfo since otherwise we'll leak that. */
1136 if (loop->aux == this)
1137 loop->aux = NULL;
1140 /* Return an invariant or register for EXPR and emit necessary
1141 computations in the LOOP_VINFO loop preheader. */
1143 tree
1144 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1146 if (is_gimple_reg (expr)
1147 || is_gimple_min_invariant (expr))
1148 return expr;
1150 if (! loop_vinfo->ivexpr_map)
1151 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1152 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1153 if (! cached)
1155 gimple_seq stmts = NULL;
1156 cached = force_gimple_operand (unshare_expr (expr),
1157 &stmts, true, NULL_TREE);
1158 if (stmts)
1160 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1161 gsi_insert_seq_on_edge_immediate (e, stmts);
1164 return cached;
1167 /* Return true if we can use CMP_TYPE as the comparison type to produce
1168 all masks required to mask LOOP_VINFO. */
1170 static bool
1171 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1173 rgroup_controls *rgm;
1174 unsigned int i;
1175 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1176 if (rgm->type != NULL_TREE
1177 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1178 cmp_type, rgm->type,
1179 OPTIMIZE_FOR_SPEED))
1180 return false;
1181 return true;
1184 /* Calculate the maximum number of scalars per iteration for every
1185 rgroup in LOOP_VINFO. */
1187 static unsigned int
1188 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1190 unsigned int res = 1;
1191 unsigned int i;
1192 rgroup_controls *rgm;
1193 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194 res = MAX (res, rgm->max_nscalars_per_iter);
1195 return res;
1198 /* Calculate the minimum precision necessary to represent:
1200 MAX_NITERS * FACTOR
1202 as an unsigned integer, where MAX_NITERS is the maximum number of
1203 loop header iterations for the original scalar form of LOOP_VINFO. */
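/* For example (a sketch): if the scalar loop is known to execute at most
   1000 iterations and FACTOR is 2, the product is at most 2000, so the
   function returns 11, the minimum number of bits of an unsigned integer
   that can represent 2000.  */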
1205 static unsigned
1206 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1208 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1210 /* Get the maximum number of iterations that is representable
1211 in the counter type. */
1212 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1213 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1215 /* Get a more refined estimate for the number of iterations. */
1216 widest_int max_back_edges;
1217 if (max_loop_iterations (loop, &max_back_edges))
1218 max_ni = wi::smin (max_ni, max_back_edges + 1);
1220 /* Work out how many bits we need to represent the limit. */
1221 return wi::min_precision (max_ni * factor, UNSIGNED);
1224 /* True if the loop needs peeling or partial vectors when vectorized. */
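/* For instance (a sketch): with a known iteration count of 100 and a
   vectorization factor of 8, 100 % 8 == 4 scalar iterations are left
   over, so the loop needs either peeling (an epilogue) or partial
   vectors; with a count of 96 and no other peeling requirements it
   needs neither.  */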
1226 static bool
1227 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1229 unsigned HOST_WIDE_INT const_vf;
1230 HOST_WIDE_INT max_niter
1231 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1233 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1234 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1235 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1236 (loop_vinfo));
1238 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1239 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1241 /* Work out the (constant) number of iterations that need to be
1242 peeled for reasons other than niters. */
1243 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1244 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1245 peel_niter += 1;
1246 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1247 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1248 return true;
1250 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1251 /* ??? When peeling for gaps but not alignment, we could
1252 try to check whether the (variable) niters is known to be
1253 VF * N + 1. That's something of a niche case though. */
1254 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1255 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1256 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1257 < (unsigned) exact_log2 (const_vf))
1258 /* In case of versioning, check if the maximum number of
1259 iterations is greater than th. If they are identical,
1260 the epilogue is unnecessary. */
1261 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1262 || ((unsigned HOST_WIDE_INT) max_niter
1263 > (th / const_vf) * const_vf))))
1264 return true;
1266 return false;
1269 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1270 whether we can actually generate the masks required. Return true if so,
1271 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
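/* Conceptually (a sketch of the WHILE_ULT style of masking used here),
   the mask for a vector iteration starting at scalar index IV is

     mask[j] = (IV + j < niters)   for j = 0 .. nunits - 1

   so the final, partial iteration simply has its trailing lanes
   disabled instead of requiring a scalar epilogue.  */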
1273 static bool
1274 vect_verify_full_masking (loop_vec_info loop_vinfo)
1276 unsigned int min_ni_width;
1278 /* Use a normal loop if there are no statements that need masking.
1279 This only happens in rare degenerate cases: it means that the loop
1280 has no loads, no stores, and no live-out values. */
1281 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1282 return false;
1284 /* Produce the rgroup controls. */
1285 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1287 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1288 tree vectype = mask.first;
1289 unsigned nvectors = mask.second;
1291 if (masks->rgc_vec.length () < nvectors)
1292 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1293 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1294 /* The number of scalars per iteration and the number of vectors are
1295 both compile-time constants. */
1296 unsigned int nscalars_per_iter
1297 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1298 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1300 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1302 rgm->max_nscalars_per_iter = nscalars_per_iter;
1303 rgm->type = truth_type_for (vectype);
1304 rgm->factor = 1;
1308 unsigned int max_nscalars_per_iter
1309 = vect_get_max_nscalars_per_iter (loop_vinfo);
1311 /* Work out how many bits we need to represent the limit. */
1312 min_ni_width
1313 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1315 /* Find a scalar mode for which WHILE_ULT is supported. */
1316 opt_scalar_int_mode cmp_mode_iter;
1317 tree cmp_type = NULL_TREE;
1318 tree iv_type = NULL_TREE;
1319 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1320 unsigned int iv_precision = UINT_MAX;
1322 if (iv_limit != -1)
1323 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1324 UNSIGNED);
1326 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1328 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1329 if (cmp_bits >= min_ni_width
1330 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1332 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1333 if (this_type
1334 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1336 /* Although we could stop as soon as we find a valid mode,
1337 there are at least two reasons why that's not always the
1338 best choice:
1340 - An IV that's Pmode or wider is more likely to be reusable
1341 in address calculations than an IV that's narrower than
1342 Pmode.
1344 - Doing the comparison in IV_PRECISION or wider allows
1345 a natural 0-based IV, whereas using a narrower comparison
1346 type requires mitigations against wrap-around.
1348 Conversely, if the IV limit is variable, doing the comparison
1349 in a wider type than the original type can introduce
1350 unnecessary extensions, so picking the widest valid mode
1351 is not always a good choice either.
1353 Here we prefer the first IV type that's Pmode or wider,
1354 and the first comparison type that's IV_PRECISION or wider.
1355 (The comparison type must be no wider than the IV type,
1356 to avoid extensions in the vector loop.)
1358 ??? We might want to try continuing beyond Pmode for ILP32
1359 targets if CMP_BITS < IV_PRECISION. */
1360 iv_type = this_type;
1361 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1362 cmp_type = this_type;
1363 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1364 break;
1369 if (!cmp_type)
1371 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1372 return false;
1375 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1376 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1377 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1378 return true;
1381 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1382 whether we can actually generate AVX512 style masks. Return true if so,
1383 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1385 static bool
1386 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1388 /* Produce a differently organized rgc_vec and check differently whether
1389 we can produce the masks. */
1391 /* Use a normal loop if there are no statements that need masking.
1392 This only happens in rare degenerate cases: it means that the loop
1393 has no loads, no stores, and no live-out values. */
1394 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1395 return false;
1397 /* For the decrementing IV we need to represent all values in
1398 [0, niter + niter_skip] where niter_skip is the elements we
1399 skip in the first iteration for prologue peeling. */
1400 tree iv_type = NULL_TREE;
1401 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1402 unsigned int iv_precision = UINT_MAX;
1403 if (iv_limit != -1)
1404 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1406 /* First compute the type for the IV we use to track the remaining
1407 scalar iterations. */
1408 opt_scalar_int_mode cmp_mode_iter;
1409 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1411 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1412 if (cmp_bits >= iv_precision
1413 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1415 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1416 if (iv_type)
1417 break;
1420 if (!iv_type)
1421 return false;
1423 /* Produce the rgroup controls. */
1424 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1426 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1427 tree vectype = mask.first;
1428 unsigned nvectors = mask.second;
1430 /* The number of scalars per iteration and the number of vectors are
1431 both compile-time constants. */
1432 unsigned int nscalars_per_iter
1433 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1434 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1436 /* We index the rgroup_controls vector with nscalars_per_iter
1437 which we keep constant and instead have a varying nvectors,
1438 remembering the vector mask with the fewest nV. */
1439 if (masks->rgc_vec.length () < nscalars_per_iter)
1440 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1441 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1443 if (!rgm->type || rgm->factor > nvectors)
1445 rgm->type = truth_type_for (vectype);
1446 rgm->compare_type = NULL_TREE;
1447 rgm->max_nscalars_per_iter = nscalars_per_iter;
1448 rgm->factor = nvectors;
1449 rgm->bias_adjusted_ctrl = NULL_TREE;
1453 /* There is no fixed compare type we are going to use but we have to
1454 be able to get at one for each mask group. */
1455 unsigned int min_ni_width
1456 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1458 bool ok = true;
1459 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1461 tree mask_type = rgc.type;
1462 if (!mask_type)
1463 continue;
1465 if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1467 ok = false;
1468 break;
1471 /* If iv_type is usable as compare type use that - we can elide the
1472 saturation in that case. */
1473 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1475 tree cmp_vectype
1476 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1477 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1478 rgc.compare_type = cmp_vectype;
1480 if (!rgc.compare_type)
1481 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1483 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1484 if (cmp_bits >= min_ni_width
1485 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1487 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1488 if (!cmp_type)
1489 continue;
1491 /* Check whether we can produce the mask with cmp_type. */
1492 tree cmp_vectype
1493 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1494 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1496 rgc.compare_type = cmp_vectype;
1497 break;
1501 if (!rgc.compare_type)
1503 ok = false;
1504 break;
1507 if (!ok)
1509 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1510 return false;
1513 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1514 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1515 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1516 return true;
1519 /* Check whether we can use vector access with length based on a precision
1520 comparison. So far, to keep it simple, we only allow the case that the
1521 precision of the target-supported length is larger than the precision
1522 required by the loop niters. */
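/* As an example of the precision check (a sketch): if the maximum number
   of scalar iterations times the maximum number of items per iteration
   is 2^16, min_ni_prec below is 17 bits, and the IV type chosen must be
   at least that wide; it is further widened to the precision of the
   niters type and to Pmode, as described in the code below.  */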
1524 static bool
1525 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1527 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1528 return false;
1530 machine_mode len_load_mode, len_store_mode;
1531 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1532 .exists (&len_load_mode))
1533 return false;
1534 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1535 .exists (&len_store_mode))
1536 return false;
1538 signed char partial_load_bias = internal_len_load_store_bias
1539 (IFN_LEN_LOAD, len_load_mode);
1541 signed char partial_store_bias = internal_len_load_store_bias
1542 (IFN_LEN_STORE, len_store_mode);
1544 gcc_assert (partial_load_bias == partial_store_bias);
1546 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1547 return false;
1549 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1550 len_loads with a length of zero. In order to avoid that we prohibit
1551 more than one loop length here. */
1552 if (partial_load_bias == -1
1553 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1554 return false;
1556 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1558 unsigned int max_nitems_per_iter = 1;
1559 unsigned int i;
1560 rgroup_controls *rgl;
1561 /* Find the maximum number of items per iteration for every rgroup. */
1562 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1564 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1565 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1568 /* Work out how many bits we need to represent the length limit. */
1569 unsigned int min_ni_prec
1570 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1572 /* Now use the maximum of below precisions for one suitable IV type:
1573 - the IV's natural precision
1574 - the precision needed to hold: the maximum number of scalar
1575 iterations multiplied by the scale factor (min_ni_prec above)
1576 - the Pmode precision
1578 If min_ni_prec is less than the precision of the current niters,
1579 we prefer to still use the niters type. Prefer to use Pmode and
1580 a wider IV to avoid narrow conversions. */
1582 unsigned int ni_prec
1583 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1584 min_ni_prec = MAX (min_ni_prec, ni_prec);
1585 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1587 tree iv_type = NULL_TREE;
1588 opt_scalar_int_mode tmode_iter;
1589 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1591 scalar_mode tmode = tmode_iter.require ();
1592 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1594 /* ??? Do we really want to construct one IV whose precision exceeds
1595 BITS_PER_WORD? */
1596 if (tbits > BITS_PER_WORD)
1597 break;
1599 /* Find the first available standard integral type. */
1600 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1602 iv_type = build_nonstandard_integer_type (tbits, true);
1603 break;
1607 if (!iv_type)
1609 if (dump_enabled_p ())
1610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1611 "can't vectorize with length-based partial vectors"
1612 " because there is no suitable iv type.\n");
1613 return false;
1616 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1617 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1618 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1620 return true;
1623 /* Calculate the cost of one scalar iteration of the loop. */
1624 static void
1625 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1627 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1628 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1629 int nbbs = loop->num_nodes, factor;
1630 int innerloop_iters, i;
1632 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1634 /* Gather costs for statements in the scalar loop. */
1636 /* FORNOW. */
1637 innerloop_iters = 1;
1638 if (loop->inner)
1639 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1641 for (i = 0; i < nbbs; i++)
1643 gimple_stmt_iterator si;
1644 basic_block bb = bbs[i];
1646 if (bb->loop_father == loop->inner)
1647 factor = innerloop_iters;
1648 else
1649 factor = 1;
1651 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1653 gimple *stmt = gsi_stmt (si);
1654 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1656 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1657 continue;
1659 /* Skip stmts that are not vectorized inside the loop. */
1660 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1661 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1662 && (!STMT_VINFO_LIVE_P (vstmt_info)
1663 || !VECTORIZABLE_CYCLE_DEF
1664 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1665 continue;
1667 vect_cost_for_stmt kind;
1668 if (STMT_VINFO_DATA_REF (stmt_info))
1670 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1671 kind = scalar_load;
1672 else
1673 kind = scalar_store;
1675 else if (vect_nop_conversion_p (stmt_info))
1676 continue;
1677 else
1678 kind = scalar_stmt;
1680 /* We are using vect_prologue here to avoid scaling twice
1681 by the inner loop factor. */
1682 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1683 factor, kind, stmt_info, 0, vect_prologue);
1687 /* Now accumulate cost. */
1688 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1689 add_stmt_costs (loop_vinfo->scalar_costs,
1690 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1691 loop_vinfo->scalar_costs->finish_cost (nullptr);
1695 /* Function vect_analyze_loop_form.
1697 Verify that certain CFG restrictions hold, including:
1698 - the loop has a pre-header
1699 - the loop has a single entry and exit
1700 - the loop exit condition is simple enough
1701 - the number of iterations can be analyzed, i.e., it is a countable loop.
1702 The niter may be analyzed under some assumptions. */
1704 opt_result
1705 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1707 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1709 edge exit_e = vec_init_loop_exit_info (loop);
1710 if (!exit_e)
1711 return opt_result::failure_at (vect_location,
1712 "not vectorized:"
1713 " could not determine main exit from"
1714 " loop with multiple exits.\n");
1715 info->loop_exit = exit_e;
1716 if (dump_enabled_p ())
1717 dump_printf_loc (MSG_NOTE, vect_location,
1718 "using as main loop exit: %d -> %d [AUX: %p]\n",
1719 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1721 /* Different restrictions apply when we are considering an inner-most loop,
1722 vs. an outer (nested) loop.
1723 (FORNOW. May want to relax some of these restrictions in the future). */
1725 info->inner_loop_cond = NULL;
1726 if (!loop->inner)
1728 /* Inner-most loop. We currently require that the number of BBs is
1729 exactly 2 (the header and latch). Vectorizable inner-most loops
1730 look like this:
1732 (pre-header)
1734 header <--------+
1735 | | |
1736 | +--> latch --+
1738 (exit-bb) */
1740 if (loop->num_nodes != 2)
1741 return opt_result::failure_at (vect_location,
1742 "not vectorized:"
1743 " control flow in loop.\n");
1745 if (empty_block_p (loop->header))
1746 return opt_result::failure_at (vect_location,
1747 "not vectorized: empty loop.\n");
1749 else
1751 class loop *innerloop = loop->inner;
1752 edge entryedge;
1754 /* Nested loop. We currently require that the loop is doubly-nested,
1755 contains a single inner loop, and the number of BBs is exactly 5.
1756 Vectorizable outer-loops look like this:
1758 (pre-header)
1760 header <---+
1762 inner-loop |
1764 tail ------+
1766 (exit-bb)
1768 The inner-loop has the properties expected of inner-most loops
1769 as described above. */
1771 if ((loop->inner)->inner || (loop->inner)->next)
1772 return opt_result::failure_at (vect_location,
1773 "not vectorized:"
1774 " multiple nested loops.\n");
1776 if (loop->num_nodes != 5)
1777 return opt_result::failure_at (vect_location,
1778 "not vectorized:"
1779 " control flow in loop.\n");
1781 entryedge = loop_preheader_edge (innerloop);
1782 if (entryedge->src != loop->header
1783 || !single_exit (innerloop)
1784 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1785 return opt_result::failure_at (vect_location,
1786 "not vectorized:"
1787 " unsupported outerloop form.\n");
1789 /* Analyze the inner-loop. */
1790 vect_loop_form_info inner;
1791 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1792 if (!res)
1794 if (dump_enabled_p ())
1795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1796 "not vectorized: Bad inner loop.\n");
1797 return res;
1800 /* We don't support analyzing the niter under assumptions for the
1801 inner loop. */
1802 if (!integer_onep (inner.assumptions))
1803 return opt_result::failure_at (vect_location,
1804 "not vectorized: Bad inner loop.\n");
1806 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1807 return opt_result::failure_at (vect_location,
1808 "not vectorized: inner-loop count not"
1809 " invariant.\n");
1811 if (dump_enabled_p ())
1812 dump_printf_loc (MSG_NOTE, vect_location,
1813 "Considering outer-loop vectorization.\n");
1814 info->inner_loop_cond = inner.conds[0];
1817 if (!single_exit (loop))
1818 return opt_result::failure_at (vect_location,
1819 "not vectorized: multiple exits.\n");
1820 if (EDGE_COUNT (loop->header->preds) != 2)
1821 return opt_result::failure_at (vect_location,
1822 "not vectorized:"
1823 " too many incoming edges.\n");
1825 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1826 that the loop is represented as a do-while (with a proper if-guard
1827 before the loop if needed), where the loop header contains all the
1828 executable statements, and the latch is empty. */
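/* Illustrative sketch (assuming the usual effect of earlier loop-header
   copying, not something checked here): a source loop

       while (i < n) { body; i++; }

   is typically rotated into

       if (i < n)
         do { body; i++; } while (i < n);

   so that the header block ends in the exit test and the latch block is
   empty, as required below.  */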
1829 if (!empty_block_p (loop->latch)
1830 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1831 return opt_result::failure_at (vect_location,
1832 "not vectorized: latch block not empty.\n");
1834 /* Make sure the exit is not abnormal. */
1835 if (exit_e->flags & EDGE_ABNORMAL)
1836 return opt_result::failure_at (vect_location,
1837 "not vectorized:"
1838 " abnormal loop exit edge.\n");
1840 info->conds
1841 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1842 &info->number_of_iterations,
1843 &info->number_of_iterationsm1);
1845 if (info->conds.is_empty ())
1846 return opt_result::failure_at
1847 (vect_location,
1848 "not vectorized: complicated exit condition.\n");
1850 /* Determine what the primary and alternate exit conds are. */
1851 for (unsigned i = 0; i < info->conds.length (); i++)
1853 gcond *cond = info->conds[i];
1854 if (exit_e->src == gimple_bb (cond))
1855 std::swap (info->conds[0], info->conds[i]);
1858 if (integer_zerop (info->assumptions)
1859 || !info->number_of_iterations
1860 || chrec_contains_undetermined (info->number_of_iterations))
1861 return opt_result::failure_at
1862 (info->conds[0],
1863 "not vectorized: number of iterations cannot be computed.\n");
1865 if (integer_zerop (info->number_of_iterations))
1866 return opt_result::failure_at
1867 (info->conds[0],
1868 "not vectorized: number of iterations = 0.\n");
1870 if (!(tree_fits_shwi_p (info->number_of_iterations)
1871 && tree_to_shwi (info->number_of_iterations) > 0))
1873 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_NOTE, vect_location,
1876 "Symbolic number of iterations is ");
1877 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1878 dump_printf (MSG_NOTE, "\n");
1882 return opt_result::success ();
1885 /* Create a loop_vec_info for LOOP with SHARED and the
1886 vect_analyze_loop_form result. */
1888 loop_vec_info
1889 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1890 const vect_loop_form_info *info,
1891 loop_vec_info main_loop_info)
1893 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1894 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1895 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1896 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1897 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1898 /* Also record the assumptions for versioning. */
1899 if (!integer_onep (info->assumptions) && !main_loop_info)
1900 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1902 for (gcond *cond : info->conds)
1904 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1905 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1908 for (unsigned i = 1; i < info->conds.length (); i ++)
1909 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1910 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1912 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1914 if (info->inner_loop_cond)
1916 stmt_vec_info inner_loop_cond_info
1917 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1918 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1919 /* If we have an estimate on the number of iterations of the inner
1920 loop, use that to limit the scale for costing, otherwise use
1921 --param vect-inner-loop-cost-factor literally. */
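/* Worked example (illustrative only, assuming the parameter is at its
   usual default of 50): an inner loop estimated to execute 4 times per
   outer iteration gives a cost factor of MIN (4, 50) = 4, while an inner
   loop estimated at 1000 iterations is clamped to 50.  */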
1922 widest_int nit;
1923 if (estimated_stmt_executions (loop->inner, &nit))
1924 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1925 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1928 return loop_vinfo;
1933 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1934 statements, update the vectorization factor. */
1936 static void
1937 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1939 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1940 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1941 int nbbs = loop->num_nodes;
1942 poly_uint64 vectorization_factor;
1943 int i;
1945 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1947 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1948 gcc_assert (known_ne (vectorization_factor, 0U));
1950 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1951 the vectorization factor of the loop is the unrolling factor required by
1952 the SLP instances. If that unrolling factor is 1, we say that we
1953 perform pure SLP on the loop - cross-iteration parallelism is not
1954 exploited. */
1955 bool only_slp_in_loop = true;
1956 for (i = 0; i < nbbs; i++)
1958 basic_block bb = bbs[i];
1959 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1960 gsi_next (&si))
1962 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1963 if (!stmt_info)
1964 continue;
1965 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1966 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1967 && !PURE_SLP_STMT (stmt_info))
1968 /* STMT needs both SLP and loop-based vectorization. */
1969 only_slp_in_loop = false;
1971 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1972 gsi_next (&si))
1974 if (is_gimple_debug (gsi_stmt (si)))
1975 continue;
1976 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1977 stmt_info = vect_stmt_to_vectorize (stmt_info);
1978 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1979 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1980 && !PURE_SLP_STMT (stmt_info))
1981 /* STMT needs both SLP and loop-based vectorization. */
1982 only_slp_in_loop = false;
1986 if (only_slp_in_loop)
1988 if (dump_enabled_p ())
1989 dump_printf_loc (MSG_NOTE, vect_location,
1990 "Loop contains only SLP stmts\n");
1991 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1993 else
1995 if (dump_enabled_p ())
1996 dump_printf_loc (MSG_NOTE, vect_location,
1997 "Loop contains SLP and non-SLP stmts\n");
1998 /* Both the vectorization factor and unroll factor have the form
1999 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2000 so they must have a common multiple. */
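/* For example (illustrative numbers only): a loop vectorization factor of
   4 combined with an SLP unrolling factor of 6 forces a common multiple
   of 12, which becomes the updated vectorization factor below.  */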
2001 vectorization_factor
2002 = force_common_multiple (vectorization_factor,
2003 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2006 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2007 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_NOTE, vect_location,
2010 "Updating vectorization factor to ");
2011 dump_dec (MSG_NOTE, vectorization_factor);
2012 dump_printf (MSG_NOTE, ".\n");
2016 /* Return true if STMT_INFO describes a double reduction phi and if
2017 the other phi in the reduction is also relevant for vectorization.
2018 This rejects cases such as:
2020 outer1:
2021 x_1 = PHI <x_3(outer2), ...>;
2024 inner:
2025 x_2 = ...;
2028 outer2:
2029 x_3 = PHI <x_2(inner)>;
2031 if nothing in x_2 or elsewhere makes x_1 relevant. */
2033 static bool
2034 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2036 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2037 return false;
2039 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2042 /* Function vect_analyze_loop_operations.
2044 Scan the loop stmts and make sure they are all vectorizable. */
2046 static opt_result
2047 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2049 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2050 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2051 int nbbs = loop->num_nodes;
2052 int i;
2053 stmt_vec_info stmt_info;
2054 bool need_to_vectorize = false;
2055 bool ok;
2057 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2059 auto_vec<stmt_info_for_cost> cost_vec;
2061 for (i = 0; i < nbbs; i++)
2063 basic_block bb = bbs[i];
2065 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2066 gsi_next (&si))
2068 gphi *phi = si.phi ();
2069 ok = true;
2071 stmt_info = loop_vinfo->lookup_stmt (phi);
2072 if (dump_enabled_p ())
2073 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2074 (gimple *) phi);
2075 if (virtual_operand_p (gimple_phi_result (phi)))
2076 continue;
2078 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2079 (i.e., a phi in the tail of the outer-loop). */
2080 if (! is_loop_header_bb_p (bb))
2082 /* FORNOW: we currently don't support the case that these phis
2083 are not used in the outer loop (unless it is a double reduction,
2084 i.e., this phi is vect_reduction_def), because this case
2085 would require us to actually do something here. */
2086 if (STMT_VINFO_LIVE_P (stmt_info)
2087 && !vect_active_double_reduction_p (stmt_info))
2088 return opt_result::failure_at (phi,
2089 "Unsupported loop-closed phi"
2090 " in outer-loop.\n");
2092 /* If PHI is used in the outer loop, we check that its operand
2093 is defined in the inner loop. */
2094 if (STMT_VINFO_RELEVANT_P (stmt_info))
2096 tree phi_op;
2098 if (gimple_phi_num_args (phi) != 1)
2099 return opt_result::failure_at (phi, "unsupported phi");
2101 phi_op = PHI_ARG_DEF (phi, 0);
2102 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2103 if (!op_def_info)
2104 return opt_result::failure_at (phi, "unsupported phi\n");
2106 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2107 && (STMT_VINFO_RELEVANT (op_def_info)
2108 != vect_used_in_outer_by_reduction))
2109 return opt_result::failure_at (phi, "unsupported phi\n");
2111 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2112 || (STMT_VINFO_DEF_TYPE (stmt_info)
2113 == vect_double_reduction_def))
2114 && !vectorizable_lc_phi (loop_vinfo,
2115 stmt_info, NULL, NULL))
2116 return opt_result::failure_at (phi, "unsupported phi\n");
2119 continue;
2122 gcc_assert (stmt_info);
2124 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2125 || STMT_VINFO_LIVE_P (stmt_info))
2126 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2127 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2128 /* A scalar-dependence cycle that we don't support. */
2129 return opt_result::failure_at (phi,
2130 "not vectorized:"
2131 " scalar dependence cycle.\n");
2133 if (STMT_VINFO_RELEVANT_P (stmt_info))
2135 need_to_vectorize = true;
2136 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2137 && ! PURE_SLP_STMT (stmt_info))
2138 ok = vectorizable_induction (loop_vinfo,
2139 stmt_info, NULL, NULL,
2140 &cost_vec);
2141 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2142 || (STMT_VINFO_DEF_TYPE (stmt_info)
2143 == vect_double_reduction_def)
2144 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2145 && ! PURE_SLP_STMT (stmt_info))
2146 ok = vectorizable_reduction (loop_vinfo,
2147 stmt_info, NULL, NULL, &cost_vec);
2148 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2149 == vect_first_order_recurrence)
2150 && ! PURE_SLP_STMT (stmt_info))
2151 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2152 &cost_vec);
2155 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2156 if (ok
2157 && STMT_VINFO_LIVE_P (stmt_info)
2158 && !PURE_SLP_STMT (stmt_info))
2159 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2160 -1, false, &cost_vec);
2162 if (!ok)
2163 return opt_result::failure_at (phi,
2164 "not vectorized: relevant phi not "
2165 "supported: %G",
2166 static_cast <gimple *> (phi));
2169 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2170 gsi_next (&si))
2172 gimple *stmt = gsi_stmt (si);
2173 if (!gimple_clobber_p (stmt)
2174 && !is_gimple_debug (stmt))
2176 opt_result res
2177 = vect_analyze_stmt (loop_vinfo,
2178 loop_vinfo->lookup_stmt (stmt),
2179 &need_to_vectorize,
2180 NULL, NULL, &cost_vec);
2181 if (!res)
2182 return res;
2185 } /* bbs */
2187 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2189 /* All operations in the loop are either irrelevant (they deal with loop
2190 control, or are dead), or only used outside the loop and can be moved
2191 out of the loop (e.g. invariants, inductions). The loop can be
2192 optimized away by scalar optimizations. We're better off not
2193 touching this loop. */
2194 if (!need_to_vectorize)
2196 if (dump_enabled_p ())
2197 dump_printf_loc (MSG_NOTE, vect_location,
2198 "All the computation can be taken out of the loop.\n");
2199 return opt_result::failure_at
2200 (vect_location,
2201 "not vectorized: redundant loop. no profit to vectorize.\n");
2204 return opt_result::success ();
2207 /* Return true if we know that the iteration count is smaller than the
2208 vectorization factor. Return false if it isn't, or if we can't be sure
2209 either way. */
2211 static bool
2212 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2214 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2216 HOST_WIDE_INT max_niter;
2217 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2218 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2219 else
2220 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2222 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2223 return true;
2225 return false;
2228 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2229 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2230 definitely no, or -1 if it's worth retrying. */
2232 static int
2233 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2234 unsigned *suggested_unroll_factor)
2236 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2237 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2239 /* Only loops that can handle partially-populated vectors can have iteration
2240 counts less than the vectorization factor. */
2241 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2242 && vect_known_niters_smaller_than_vf (loop_vinfo))
2244 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 "not vectorized: iteration count smaller than "
2247 "vectorization factor.\n");
2248 return 0;
2251 /* If we know the number of iterations we can do better: for the
2252 epilogue we can also decide whether the main loop leaves us
2253 with enough iterations, preferring a smaller vector epilogue that
2254 is then also possibly used for the case we skip the vector loop. */
2255 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2257 widest_int scalar_niters
2258 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2259 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2261 loop_vec_info orig_loop_vinfo
2262 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2263 unsigned lowest_vf
2264 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2265 int prolog_peeling = 0;
2266 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2267 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2268 if (prolog_peeling >= 0
2269 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2270 lowest_vf))
2272 unsigned gap
2273 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2274 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2275 % lowest_vf + gap);
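/* Worked example (illustrative only): with 100 scalar iterations, a
   main-loop VF of 16, prologue peeling of 3 iterations and no peeling
   for gaps, the epilogue is left with (100 - 3) % 16 = 1 scalar
   iteration.  */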
2278 /* Reject vectorizing for a single scalar iteration, even if
2279 we could in principle implement that using partial vectors. */
2280 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2281 if (scalar_niters <= peeling_gap + 1)
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 "not vectorized: loop only has a single "
2286 "scalar iteration.\n");
2287 return 0;
2290 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2292 /* Check that the loop processes at least one full vector. */
2293 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2294 if (known_lt (scalar_niters, vf))
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298 "loop does not have enough iterations "
2299 "to support vectorization.\n");
2300 return 0;
2303 /* If we need to peel an extra epilogue iteration to handle data
2304 accesses with gaps, check that there are enough scalar iterations
2305 available.
2307 The check above is redundant with this one when peeling for gaps,
2308 but the distinction is useful for diagnostics. */
2309 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2310 && known_le (scalar_niters, vf))
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314 "loop does not have enough iterations "
2315 "to support peeling for gaps.\n");
2316 return 0;
2321 /* If using the "very cheap" model, reject cases in which we'd keep
2322 a copy of the scalar code (even if we might be able to vectorize it). */
2323 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2324 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2325 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2326 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2330 "some scalar iterations would need to be peeled\n");
2331 return 0;
2334 int min_profitable_iters, min_profitable_estimate;
2335 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2336 &min_profitable_estimate,
2337 suggested_unroll_factor);
2339 if (min_profitable_iters < 0)
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "not vectorized: vectorization not profitable.\n");
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "not vectorized: vector version will never be "
2347 "profitable.\n");
2348 return -1;
2351 int min_scalar_loop_bound = (param_min_vect_loop_bound
2352 * assumed_vf);
2354 /* Use the cost model only if it is more conservative than the
2355 user-specified threshold. */
2356 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2357 min_profitable_iters);
2359 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2361 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2362 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2364 if (dump_enabled_p ())
2365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2366 "not vectorized: vectorization not profitable.\n");
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "not vectorized: iteration count smaller than user "
2370 "specified loop bound parameter or minimum profitable "
2371 "iterations (whichever is more conservative).\n");
2372 return 0;
2375 /* The static profitability threshold min_profitable_estimate includes
2376 the cost of having to check at runtime whether the scalar loop
2377 should be used instead. If it turns out that we don't need or want
2378 such a check, the threshold we should use for the static estimate
2379 is simply the point at which the vector loop becomes more profitable
2380 than the scalar loop. */
2381 if (min_profitable_estimate > min_profitable_iters
2382 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2383 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2384 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2385 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2387 if (dump_enabled_p ())
2388 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2389 " choice between the scalar and vector loops\n");
2390 min_profitable_estimate = min_profitable_iters;
2393 /* If the vector loop needs multiple iterations to be beneficial then
2394 things are probably too close to call, and the conservative thing
2395 would be to stick with the scalar code. */
2396 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2397 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2399 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2401 "one iteration of the vector loop would be"
2402 " more expensive than the equivalent number of"
2403 " iterations of the scalar loop\n");
2404 return 0;
2407 HOST_WIDE_INT estimated_niter;
2409 /* If we are vectorizing an epilogue then we know the maximum number of
2410 scalar iterations it will cover is at least one lower than the
2411 vectorization factor of the main loop. */
2412 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2413 estimated_niter
2414 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2415 else
2417 estimated_niter = estimated_stmt_executions_int (loop);
2418 if (estimated_niter == -1)
2419 estimated_niter = likely_max_stmt_executions_int (loop);
2421 if (estimated_niter != -1
2422 && ((unsigned HOST_WIDE_INT) estimated_niter
2423 < MAX (th, (unsigned) min_profitable_estimate)))
2425 if (dump_enabled_p ())
2426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2427 "not vectorized: estimated iteration count too "
2428 "small.\n");
2429 if (dump_enabled_p ())
2430 dump_printf_loc (MSG_NOTE, vect_location,
2431 "not vectorized: estimated iteration count smaller "
2432 "than specified loop bound parameter or minimum "
2433 "profitable iterations (whichever is more "
2434 "conservative).\n");
2435 return -1;
2438 return 1;
2441 static opt_result
2442 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2443 vec<data_reference_p> *datarefs,
2444 unsigned int *n_stmts)
2446 *n_stmts = 0;
2447 for (unsigned i = 0; i < loop->num_nodes; i++)
2448 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2449 !gsi_end_p (gsi); gsi_next (&gsi))
2451 gimple *stmt = gsi_stmt (gsi);
2452 if (is_gimple_debug (stmt))
2453 continue;
2454 ++(*n_stmts);
2455 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2456 NULL, 0);
2457 if (!res)
2459 if (is_gimple_call (stmt) && loop->safelen)
2461 tree fndecl = gimple_call_fndecl (stmt), op;
2462 if (fndecl == NULL_TREE
2463 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2465 fndecl = gimple_call_arg (stmt, 0);
2466 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2467 fndecl = TREE_OPERAND (fndecl, 0);
2468 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2470 if (fndecl != NULL_TREE)
2472 cgraph_node *node = cgraph_node::get (fndecl);
2473 if (node != NULL && node->simd_clones != NULL)
2475 unsigned int j, n = gimple_call_num_args (stmt);
2476 for (j = 0; j < n; j++)
2478 op = gimple_call_arg (stmt, j);
2479 if (DECL_P (op)
2480 || (REFERENCE_CLASS_P (op)
2481 && get_base_address (op)))
2482 break;
2484 op = gimple_call_lhs (stmt);
2485 /* Ignore #pragma omp declare simd functions
2486 if they don't have data references in the
2487 call stmt itself. */
2488 if (j == n
2489 && !(op
2490 && (DECL_P (op)
2491 || (REFERENCE_CLASS_P (op)
2492 && get_base_address (op)))))
2493 continue;
2497 return res;
2499 /* If dependence analysis will give up due to the limit on the
2500 number of datarefs, stop here and fail fatally. */
2501 if (datarefs->length ()
2502 > (unsigned)param_loop_max_datarefs_for_datadeps)
2503 return opt_result::failure_at (stmt, "exceeded param "
2504 "loop-max-datarefs-for-datadeps\n");
2506 return opt_result::success ();
2509 /* Look for SLP-only access groups and turn each individual access into its own
2510 group. */
2511 static void
2512 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2514 unsigned int i;
2515 struct data_reference *dr;
2517 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2519 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2520 FOR_EACH_VEC_ELT (datarefs, i, dr)
2522 gcc_assert (DR_REF (dr));
2523 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2525 /* Check if the load is a part of an interleaving chain. */
2526 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2528 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2529 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2530 unsigned int group_size = DR_GROUP_SIZE (first_element);
2532 /* Check if this is an SLP-only group. */
2533 if (!STMT_SLP_TYPE (stmt_info)
2534 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2536 /* Dissolve the group. */
2537 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2539 stmt_vec_info vinfo = first_element;
2540 while (vinfo)
2542 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2543 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2544 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2545 DR_GROUP_SIZE (vinfo) = 1;
2546 if (STMT_VINFO_STRIDED_P (first_element)
2547 /* We cannot handle stores with gaps. */
2548 || DR_IS_WRITE (dr_info->dr))
2550 STMT_VINFO_STRIDED_P (vinfo) = true;
2551 DR_GROUP_GAP (vinfo) = 0;
2553 else
2554 DR_GROUP_GAP (vinfo) = group_size - 1;
2555 /* Duplicate and adjust alignment info, it needs to
2556 be present on each group leader, see dr_misalignment. */
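/* Worked example (illustrative numbers only): with a target alignment of
   16 bytes, a group leader misaligned by 4 bytes and a member whose
   DR_INIT is 8 bytes further on, the member's misalignment becomes
   (4 + 8) % 16 = 12.  */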
2557 if (vinfo != first_element)
2559 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2560 dr_info2->target_alignment = dr_info->target_alignment;
2561 int misalignment = dr_info->misalignment;
2562 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2564 HOST_WIDE_INT diff
2565 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2566 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2567 unsigned HOST_WIDE_INT align_c
2568 = dr_info->target_alignment.to_constant ();
2569 misalignment = (misalignment + diff) % align_c;
2571 dr_info2->misalignment = misalignment;
2573 vinfo = next;
2580 /* Determine if operating on full vectors for LOOP_VINFO might leave
2581 some scalar iterations still to do. If so, decide how we should
2582 handle those scalar iterations. The possibilities are:
2584 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2585 In this case:
2587 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2588 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2589 LOOP_VINFO_PEELING_FOR_NITER == false
2591 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2592 to handle the remaining scalar iterations. In this case:
2594 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2595 LOOP_VINFO_PEELING_FOR_NITER == true
2597 There are two choices:
2599 (2a) Consider vectorizing the epilogue loop at the same VF as the
2600 main loop, but using partial vectors instead of full vectors.
2601 In this case:
2603 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2605 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2606 In this case:
2608 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false. */
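/* Worked example (illustrative only): with a VF of 4 and 10 scalar
   iterations, (1) runs three partial-vector iterations, the last with
   only two active lanes, while (2) runs two full-vector iterations and
   leaves two scalar iterations for the epilogue, which may itself be
   vectorized as described in (2a) or (2b).  */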
2611 opt_result
2612 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2614 /* Determine whether there would be any scalar iterations left over. */
2615 bool need_peeling_or_partial_vectors_p
2616 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2618 /* Decide whether to vectorize the loop with partial vectors. */
2619 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2620 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2621 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2622 && need_peeling_or_partial_vectors_p)
2624 /* For partial-vector-usage=1, try to push the handling of partial
2625 vectors to the epilogue, with the main loop continuing to operate
2626 on full vectors.
2628 If we are unrolling we also do not want to use partial vectors. This
2629 is to avoid the overhead of generating multiple masks and also to
2630 avoid having to execute entire iterations of FALSE masked instructions
2631 when dealing with one or fewer full iterations.
2633 ??? We could then end up failing to use partial vectors if we
2634 decide to peel iterations into a prologue, and if the main loop
2635 then ends up processing fewer than VF iterations. */
2636 if ((param_vect_partial_vector_usage == 1
2637 || loop_vinfo->suggested_unroll_factor > 1)
2638 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2639 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2640 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2641 else
2642 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2645 if (dump_enabled_p ())
2646 dump_printf_loc (MSG_NOTE, vect_location,
2647 "operating on %s vectors%s.\n",
2648 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2649 ? "partial" : "full",
2650 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2651 ? " for epilogue loop" : "");
2653 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2654 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2655 && need_peeling_or_partial_vectors_p);
2657 return opt_result::success ();
2660 /* Function vect_analyze_loop_2.
2662 Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2663 analyses will record information in some members of LOOP_VINFO. FATAL
2664 indicates whether some analysis hit a fatal error. If the non-NULL
2665 pointer SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled
2666 with a suggested unroll factor worked out during this analysis, while a
2667 NULL pointer means we are applying a previously suggested unroll factor.
2668 SLP_DONE_FOR_SUGGESTED_UF holds the SLP decision made when the suggested
2669 unroll factor was worked out. */
2670 static opt_result
2671 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2672 unsigned *suggested_unroll_factor,
2673 bool& slp_done_for_suggested_uf)
2675 opt_result ok = opt_result::success ();
2676 int res;
2677 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2678 poly_uint64 min_vf = 2;
2679 loop_vec_info orig_loop_vinfo = NULL;
2681 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2682 loop_vec_info of the first vectorized loop. */
2683 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2684 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2685 else
2686 orig_loop_vinfo = loop_vinfo;
2687 gcc_assert (orig_loop_vinfo);
2689 /* The first group of checks is independent of the vector size. */
2690 fatal = true;
2692 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2693 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2694 return opt_result::failure_at (vect_location,
2695 "not vectorized: simd if(0)\n");
2697 /* Find all data references in the loop (which correspond to vdefs/vuses)
2698 and analyze their evolution in the loop. */
2700 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2702 /* Gather the data references and count stmts in the loop. */
2703 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2705 opt_result res
2706 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2707 &LOOP_VINFO_DATAREFS (loop_vinfo),
2708 &LOOP_VINFO_N_STMTS (loop_vinfo));
2709 if (!res)
2711 if (dump_enabled_p ())
2712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2713 "not vectorized: loop contains function "
2714 "calls or data references that cannot "
2715 "be analyzed\n");
2716 return res;
2718 loop_vinfo->shared->save_datarefs ();
2720 else
2721 loop_vinfo->shared->check_datarefs ();
2723 /* Analyze the data references and also adjust the minimal
2724 vectorization factor according to the loads and stores. */
2726 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2727 if (!ok)
2729 if (dump_enabled_p ())
2730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2731 "bad data references.\n");
2732 return ok;
2735 /* Check if we are applying unroll factor now. */
2736 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2737 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2739 /* If the SLP decision was false when the suggested unroll factor was
2740 worked out, and we are now applying that unroll factor, we can simply
2741 skip all SLP-related analyses this time. */
2742 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2744 /* Classify all cross-iteration scalar data-flow cycles.
2745 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2746 vect_analyze_scalar_cycles (loop_vinfo, slp);
2748 vect_pattern_recog (loop_vinfo);
2750 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2752 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2753 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2755 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2756 if (!ok)
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 "bad data access.\n");
2761 return ok;
2764 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2766 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2767 if (!ok)
2769 if (dump_enabled_p ())
2770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2771 "unexpected pattern.\n");
2772 return ok;
2775 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are no longer considered fatal. */
2776 fatal = false;
2778 /* Analyze data dependences between the data-refs in the loop
2779 and adjust the maximum vectorization factor according to
2780 the dependences.
2781 FORNOW: fail at the first data dependence that we encounter. */
2783 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2784 if (!ok)
2786 if (dump_enabled_p ())
2787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2788 "bad data dependence.\n");
2789 return ok;
2791 if (max_vf != MAX_VECTORIZATION_FACTOR
2792 && maybe_lt (max_vf, min_vf))
2793 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2794 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2796 ok = vect_determine_vectorization_factor (loop_vinfo);
2797 if (!ok)
2799 if (dump_enabled_p ())
2800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2801 "can't determine vectorization factor.\n");
2802 return ok;
2804 if (max_vf != MAX_VECTORIZATION_FACTOR
2805 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2806 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2808 /* Compute the scalar iteration cost. */
2809 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2811 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2813 if (slp)
2815 /* Check the SLP opportunities in the loop, analyze and build
2816 SLP trees. */
2817 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2818 if (!ok)
2819 return ok;
2821 /* If there are any SLP instances mark them as pure_slp. */
2822 slp = vect_make_slp_decision (loop_vinfo);
2823 if (slp)
2825 /* Find stmts that need to be both vectorized and SLPed. */
2826 vect_detect_hybrid_slp (loop_vinfo);
2828 /* Update the vectorization factor based on the SLP decision. */
2829 vect_update_vf_for_slp (loop_vinfo);
2831 /* Optimize the SLP graph with the vectorization factor fixed. */
2832 vect_optimize_slp (loop_vinfo);
2834 /* Gather the loads reachable from the SLP graph entries. */
2835 vect_gather_slp_loads (loop_vinfo);
2839 bool saved_can_use_partial_vectors_p
2840 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2842 /* We don't expect to have to roll back to anything other than an empty
2843 set of rgroups. */
2844 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2846 /* This is the point where we can re-start analysis with SLP forced off. */
2847 start_over:
2849 /* Apply the suggested unrolling factor; this was determined by the backend
2850 during finish_cost the first time we ran the analysis for this
2851 vector mode. */
2852 if (applying_suggested_uf)
2853 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2855 /* Now the vectorization factor is final. */
2856 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2857 gcc_assert (known_ne (vectorization_factor, 0U));
2859 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2861 dump_printf_loc (MSG_NOTE, vect_location,
2862 "vectorization_factor = ");
2863 dump_dec (MSG_NOTE, vectorization_factor);
2864 dump_printf (MSG_NOTE, ", niters = %wd\n",
2865 LOOP_VINFO_INT_NITERS (loop_vinfo));
2868 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2870 /* Analyze the alignment of the data-refs in the loop.
2871 Fail if a data reference is found that cannot be vectorized. */
2873 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2874 if (!ok)
2876 if (dump_enabled_p ())
2877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2878 "bad data alignment.\n");
2879 return ok;
2882 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2883 It is important to call pruning after vect_analyze_data_ref_accesses,
2884 since we use grouping information gathered by interleaving analysis. */
2885 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2886 if (!ok)
2887 return ok;
2889 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2890 vectorization, since we do not want to add extra peeling or
2891 add versioning for alignment. */
2892 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2893 /* This pass will decide on using loop versioning and/or loop peeling in
2894 order to enhance the alignment of data references in the loop. */
2895 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2896 if (!ok)
2897 return ok;
2899 if (slp)
2901 /* Analyze operations in the SLP instances. Note this may
2902 remove unsupported SLP instances which makes the above
2903 SLP kind detection invalid. */
2904 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2905 vect_slp_analyze_operations (loop_vinfo);
2906 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2908 ok = opt_result::failure_at (vect_location,
2909 "unsupported SLP instances\n");
2910 goto again;
2913 /* Check whether any load in ALL SLP instances is possibly permuted. */
2914 slp_tree load_node, slp_root;
2915 unsigned i, x;
2916 slp_instance instance;
2917 bool can_use_lanes = true;
2918 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2920 slp_root = SLP_INSTANCE_TREE (instance);
2921 int group_size = SLP_TREE_LANES (slp_root);
2922 tree vectype = SLP_TREE_VECTYPE (slp_root);
2923 bool loads_permuted = false;
2924 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2926 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2927 continue;
2928 unsigned j;
2929 stmt_vec_info load_info;
2930 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2931 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2933 loads_permuted = true;
2934 break;
2938 /* If the loads and stores can be handled with load/store-lane
2939 instructions, record it and move on to the next instance. */
2940 if (loads_permuted
2941 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2942 && vect_store_lanes_supported (vectype, group_size, false)
2943 != IFN_LAST)
2945 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2947 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2948 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2949 /* Use SLP for strided accesses (or if we can't use
2950 load-lanes). */
2951 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2952 || vect_load_lanes_supported
2953 (STMT_VINFO_VECTYPE (stmt_vinfo),
2954 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2955 break;
2958 can_use_lanes
2959 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2961 if (can_use_lanes && dump_enabled_p ())
2962 dump_printf_loc (MSG_NOTE, vect_location,
2963 "SLP instance %p can use load/store-lanes\n",
2964 (void *) instance);
2966 else
2968 can_use_lanes = false;
2969 break;
2973 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2974 with SLP disabled. */
2975 if (can_use_lanes)
2977 ok = opt_result::failure_at (vect_location,
2978 "Built SLP cancelled: can use "
2979 "load/store-lanes\n");
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2982 "Built SLP cancelled: all SLP instances support "
2983 "load/store-lanes\n");
2984 goto again;
2988 /* Dissolve SLP-only groups. */
2989 vect_dissolve_slp_only_groups (loop_vinfo);
2991 /* Scan all the remaining operations in the loop that are not subject
2992 to SLP and make sure they are vectorizable. */
2993 ok = vect_analyze_loop_operations (loop_vinfo);
2994 if (!ok)
2996 if (dump_enabled_p ())
2997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2998 "bad operation or unsupported loop bound.\n");
2999 return ok;
3002 /* For now, we don't expect to mix both masking and length approaches for one
3003 loop; disable the use of partial vectors if both are recorded. */
3004 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3005 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3006 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3008 if (dump_enabled_p ())
3009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3010 "can't vectorize a loop with partial vectors"
3011 " because we don't expect to mix different"
3012 " approaches with partial vectors for the"
3013 " same loop.\n");
3014 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3017 /* If we still have the option of using partial vectors,
3018 check whether we can generate the necessary loop controls. */
3019 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3021 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3023 if (!vect_verify_full_masking (loop_vinfo)
3024 && !vect_verify_full_masking_avx512 (loop_vinfo))
3025 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3027 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3028 if (!vect_verify_loop_lens (loop_vinfo))
3029 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3032 /* If we're vectorizing a loop that uses length "controls" and
3033 can iterate more than once, we apply the decrementing IV approach
3034 in loop control. */
3035 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3036 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3037 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3038 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3039 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3040 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3041 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3043 /* If a loop uses length controls and has a decrementing loop control IV,
3044 we will normally pass that IV through a MIN_EXPR to calculate the
3045 basis for the length controls. E.g. in a loop that processes one
3046 element per scalar iteration, the number of elements would be
3047 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3049 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3050 step, since only the final iteration of the vector loop can have
3051 inactive lanes.
3053 However, some targets have a dedicated instruction for calculating the
3054 preferred length, given the total number of elements that still need to
3055 be processed. This is encapsulated in the SELECT_VL internal function.
3057 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3058 to determine the basis for the length controls. However, unlike the
3059 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3060 lanes inactive in any iteration of the vector loop, not just the last
3061 iteration. This SELECT_VL approach therefore requires us to use pointer
3062 IVs with variable steps.
3064 Once we've decided how many elements should be processed by one
3065 iteration of the vector loop, we need to populate the rgroup controls.
3066 If a loop has multiple rgroups, we need to make sure that those rgroups
3067 "line up" (that is, they must be consistent about which elements are
3068 active and which aren't). This is done by vect_adjust_loop_lens_control.
3070 In principle, it would be possible to use vect_adjust_loop_lens_control
3071 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3072 However:
3074 (1) In practice, it only makes sense to use SELECT_VL when a vector
3075 operation will be controlled directly by the result. It is not
3076 worth using SELECT_VL if it would only be the input to other
3077 calculations.
3079 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3080 pointer IV will need N updates by a variable amount (N-1 updates
3081 within the iteration and 1 update to move to the next iteration).
3083 Because of this, we prefer to use the MIN_EXPR approach whenever there
3084 is more than one length control.
3086 In addition, SELECT_VL always operates to a granularity of 1 unit.
3087 If we wanted to use it to control an SLP operation on N consecutive
3088 elements, we would need to make the SELECT_VL inputs measure scalar
3089 iterations (rather than elements) and then multiply the SELECT_VL
3090 result by N. But using SELECT_VL this way is inefficient because
3091 of (1) above.
3093 In addition, we don't apply SELECT_VL on a single rgroup when both (1)
3094 and (2) below are satisfied:
3096 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3097 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3099 Since SELECT_VL (variable step) will make SCEV analysis fail, and we
3100 will then fail to gain the benefits of subsequent unroll optimizations,
3101 we prefer using the MIN_EXPR approach in this situation. */
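/* Illustrative sketch (not actual generated code): with a VF of 4 and N
   elements remaining, the MIN_EXPR scheme computes

       len = MIN_EXPR <N, 4>;  N = N - len;

   so only the final iteration can have inactive lanes and pointer IVs can
   step by a constant amount, whereas the SELECT_VL scheme computes

       len = .SELECT_VL (N, 4);  N = N - len;

   where the target may choose len < 4 in any iteration, so pointer IVs
   must step by the variable len.  */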
3102 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3104 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3105 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3106 OPTIMIZE_FOR_SPEED)
3107 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3108 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3109 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3110 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3111 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3114 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3115 assuming that the loop will be used as a main loop. We will redo
3116 this analysis later if we instead decide to use the loop as an
3117 epilogue loop. */
3118 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3119 if (!ok)
3120 return ok;
3122 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3123 to be able to handle fewer than VF scalars, or needs to have a lower VF
3124 than the main loop. */
3125 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3126 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3128 poly_uint64 unscaled_vf
3129 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3130 orig_loop_vinfo->suggested_unroll_factor);
3131 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3132 return opt_result::failure_at (vect_location,
3133 "Vectorization factor too high for"
3134 " epilogue loop.\n");
3137 /* Check the costings of the loop make vectorizing worthwhile. */
3138 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3139 if (res < 0)
3141 ok = opt_result::failure_at (vect_location,
3142 "Loop costings may not be worthwhile.\n");
3143 goto again;
3145 if (!res)
3146 return opt_result::failure_at (vect_location,
3147 "Loop costings not worthwhile.\n");
3149 /* If an epilogue loop is required make sure we can create one. */
3150 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3151 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3153 if (dump_enabled_p ())
3154 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3155 if (!vect_can_advance_ivs_p (loop_vinfo)
3156 || !slpeel_can_duplicate_loop_p (loop,
3157 LOOP_VINFO_IV_EXIT (loop_vinfo),
3158 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3160 ok = opt_result::failure_at (vect_location,
3161 "not vectorized: can't create required "
3162 "epilog loop\n");
3163 goto again;
3167 /* During peeling, we need to check if the number of loop iterations is
3168 enough for both the peeled prolog loop and the vector loop. This check
3169 can be merged along with threshold check of loop versioning, so
3170 increase threshold for this case if necessary.
3172 If we are analyzing an epilogue we still want to check what its
3173 versioning threshold would be. If we decide to vectorize the epilogues we
3174 will want to use the lowest versioning threshold of all epilogues and main
3175 loop. This will enable us to enter a vectorized epilogue even when
3176 versioning the loop. We can't simply check whether the epilogue requires
3177 versioning though since we may have skipped some versioning checks when
3178 analyzing the epilogue. For instance, checks for alias versioning will be
3179 skipped when dealing with epilogues as we assume we already checked them
3180 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
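/* Worked example (illustrative numbers only): with prologue peeling of 3
   iterations, a VF of 8 and peeling for gaps, niters_th below becomes
   3 + 8 + 1 = 12; if the cost-model threshold th is larger and a runtime
   profitability check is applied, th is used instead.  */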
3181 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3183 poly_uint64 niters_th = 0;
3184 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3186 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3188 /* Niters for peeled prolog loop. */
3189 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3191 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3192 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3193 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3195 else
3196 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3199 /* Niters for at least one iteration of vectorized loop. */
3200 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3201 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3202 /* One additional iteration because of peeling for gap. */
3203 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3204 niters_th += 1;
3206 /* Use the same condition as vect_transform_loop to decide when to use
3207 the cost to determine a versioning threshold. */
3208 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3209 && ordered_p (th, niters_th))
3210 niters_th = ordered_max (poly_uint64 (th), niters_th);
3212 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3215 gcc_assert (known_eq (vectorization_factor,
3216 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3218 slp_done_for_suggested_uf = slp;
3220 /* Ok to vectorize! */
3221 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3222 return opt_result::success ();
3224 again:
3225 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3226 gcc_assert (!ok);
3228 /* Try again with SLP forced off but if we didn't do any SLP there is
3229 no point in re-trying. */
3230 if (!slp)
3231 return ok;
3233 /* If the SLP decision was true when the suggested unroll factor was
3234 worked out, and we are now applying that unroll factor, we don't need
3235 to re-try any more. */
3236 if (applying_suggested_uf && slp_done_for_suggested_uf)
3237 return ok;
3239 /* If there are reduction chains re-trying will fail anyway. */
3240 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3241 return ok;
3243 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3244 via interleaving or lane instructions. */
3245 slp_instance instance;
3246 slp_tree node;
3247 unsigned i, j;
3248 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3250 stmt_vec_info vinfo;
3251 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3252 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3253 continue;
3254 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3255 unsigned int size = DR_GROUP_SIZE (vinfo);
3256 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3257 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3258 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3259 && ! vect_grouped_store_supported (vectype, size))
3260 return opt_result::failure_at (vinfo->stmt,
3261 "unsupported grouped store\n");
3262 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3264 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3265 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3266 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3267 size = DR_GROUP_SIZE (vinfo);
3268 vectype = STMT_VINFO_VECTYPE (vinfo);
3269 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3270 && ! vect_grouped_load_supported (vectype, single_element_p,
3271 size))
3272 return opt_result::failure_at (vinfo->stmt,
3273 "unsupported grouped load\n");
3277 if (dump_enabled_p ())
3278 dump_printf_loc (MSG_NOTE, vect_location,
3279 "re-trying with SLP disabled\n");
3281 /* Roll back state appropriately. No SLP this time. */
3282 slp = false;
3284 /* Restore the vectorization factor as it was without SLP. */
3284 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3285 /* Free the SLP instances. */
3286 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3287 vect_free_slp_instance (instance);
3288 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3289 /* Reset SLP type to loop_vect on all stmts. */
3290 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3292 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3293 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3294 !gsi_end_p (si); gsi_next (&si))
3296 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3297 STMT_SLP_TYPE (stmt_info) = loop_vect;
3298 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3299 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3301 /* vectorizable_reduction adjusts reduction stmt def-types,
3302 restore them to that of the PHI. */
3303 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3304 = STMT_VINFO_DEF_TYPE (stmt_info);
3305 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3306 (STMT_VINFO_REDUC_DEF (stmt_info)))
3307 = STMT_VINFO_DEF_TYPE (stmt_info);
3310 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3311 !gsi_end_p (si); gsi_next (&si))
3313 if (is_gimple_debug (gsi_stmt (si)))
3314 continue;
3315 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3316 STMT_SLP_TYPE (stmt_info) = loop_vect;
3317 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3319 stmt_vec_info pattern_stmt_info
3320 = STMT_VINFO_RELATED_STMT (stmt_info);
3321 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3322 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3324 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3325 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3326 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3327 !gsi_end_p (pi); gsi_next (&pi))
3328 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3329 = loop_vect;
3333 /* Free optimized alias test DDRS. */
3334 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3335 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3336 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3337 /* Reset target cost data. */
3338 delete loop_vinfo->vector_costs;
3339 loop_vinfo->vector_costs = nullptr;
3340 /* Reset accumulated rgroup information. */
3341 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3342 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3343 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3344 /* Reset assorted flags. */
3345 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3346 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3347 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3348 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3349 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3350 = saved_can_use_partial_vectors_p;
3352 goto start_over;
3355 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3356 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3357 OLD_LOOP_VINFO is better unless something specifically indicates
3358 otherwise.
3360 Note that this deliberately isn't a partial order. */
3362 static bool
3363 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3364 loop_vec_info old_loop_vinfo)
3366 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3367 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3369 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3370 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3372 /* Always prefer a VF of loop->simdlen over any other VF. */
3373 if (loop->simdlen)
3375 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3376 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3377 if (new_simdlen_p != old_simdlen_p)
3378 return new_simdlen_p;
3381 const auto *old_costs = old_loop_vinfo->vector_costs;
3382 const auto *new_costs = new_loop_vinfo->vector_costs;
3383 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3384 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3386 return new_costs->better_main_loop_than_p (old_costs);
3389 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3390 true if we should. */
3392 static bool
3393 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3394 loop_vec_info old_loop_vinfo)
3396 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3397 return false;
3399 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_NOTE, vect_location,
3401 "***** Preferring vector mode %s to vector mode %s\n",
3402 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3403 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3404 return true;
3407 /* Analyze LOOP with VECTOR_MODES[MODE_I], as an epilogue loop if MAIN_LOOP_VINFO
3408 is not NULL. Record AUTODETECTED_VECTOR_MODE if VECTOR_MODES[MODE_I] is
3409 VOIDmode and advance MODE_I to the next mode worth analyzing.
3410 Return the loop_vinfo on success and a wrapped null on failure. */
3412 static opt_loop_vec_info
3413 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3414 const vect_loop_form_info *loop_form_info,
3415 loop_vec_info main_loop_vinfo,
3416 const vector_modes &vector_modes, unsigned &mode_i,
3417 machine_mode &autodetected_vector_mode,
3418 bool &fatal)
3420 loop_vec_info loop_vinfo
3421 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3423 machine_mode vector_mode = vector_modes[mode_i];
3424 loop_vinfo->vector_mode = vector_mode;
3425 unsigned int suggested_unroll_factor = 1;
3426 bool slp_done_for_suggested_uf = false;
3428 /* Run the main analysis. */
3429 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3430 &suggested_unroll_factor,
3431 slp_done_for_suggested_uf);
3432 if (dump_enabled_p ())
3433 dump_printf_loc (MSG_NOTE, vect_location,
3434 "***** Analysis %s with vector mode %s\n",
3435 res ? "succeeded" : " failed",
3436 GET_MODE_NAME (loop_vinfo->vector_mode));
3438 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3440 if (dump_enabled_p ())
3441 dump_printf_loc (MSG_NOTE, vect_location,
3442 "***** Re-trying analysis for unrolling"
3443 " with unroll factor %d and slp %s.\n",
3444 suggested_unroll_factor,
3445 slp_done_for_suggested_uf ? "on" : "off");
3446 loop_vec_info unroll_vinfo
3447 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3448 unroll_vinfo->vector_mode = vector_mode;
3449 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3450 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3451 slp_done_for_suggested_uf);
3452 if (new_res)
3454 delete loop_vinfo;
3455 loop_vinfo = unroll_vinfo;
3457 else
3458 delete unroll_vinfo;
3461 /* Remember the autodetected vector mode. */
3462 if (vector_mode == VOIDmode)
3463 autodetected_vector_mode = loop_vinfo->vector_mode;
3465 /* Advance mode_i, first skipping modes that would give the same
3466 analysis result. */
3467 while (mode_i + 1 < vector_modes.length ()
3468 && vect_chooses_same_modes_p (loop_vinfo,
3469 vector_modes[mode_i + 1]))
3471 if (dump_enabled_p ())
3472 dump_printf_loc (MSG_NOTE, vect_location,
3473 "***** The result for vector mode %s would"
3474 " be the same\n",
3475 GET_MODE_NAME (vector_modes[mode_i + 1]));
3476 mode_i += 1;
3478 if (mode_i + 1 < vector_modes.length ()
3479 && VECTOR_MODE_P (autodetected_vector_mode)
3480 && (related_vector_mode (vector_modes[mode_i + 1],
3481 GET_MODE_INNER (autodetected_vector_mode))
3482 == autodetected_vector_mode)
3483 && (related_vector_mode (autodetected_vector_mode,
3484 GET_MODE_INNER (vector_modes[mode_i + 1]))
3485 == vector_modes[mode_i + 1]))
3487 if (dump_enabled_p ())
3488 dump_printf_loc (MSG_NOTE, vect_location,
3489 "***** Skipping vector mode %s, which would"
3490 " repeat the analysis for %s\n",
3491 GET_MODE_NAME (vector_modes[mode_i + 1]),
3492 GET_MODE_NAME (autodetected_vector_mode));
3493 mode_i += 1;
3495 mode_i++;
3497 if (!res)
3499 delete loop_vinfo;
3500 if (fatal)
3501 gcc_checking_assert (main_loop_vinfo == NULL);
3502 return opt_loop_vec_info::propagate_failure (res);
3505 return opt_loop_vec_info::success (loop_vinfo);
3508 /* Function vect_analyze_loop.
3510 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3511 for it. The different analyses will record information in the
3512 loop_vec_info struct. */
3513 opt_loop_vec_info
3514 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3516 DUMP_VECT_SCOPE ("analyze_loop_nest");
3518 if (loop_outer (loop)
3519 && loop_vec_info_for_loop (loop_outer (loop))
3520 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3521 return opt_loop_vec_info::failure_at (vect_location,
3522 "outer-loop already vectorized.\n");
3524 if (!find_loop_nest (loop, &shared->loop_nest))
3525 return opt_loop_vec_info::failure_at
3526 (vect_location,
3527 "not vectorized: loop nest containing two or more consecutive inner"
3528 " loops cannot be vectorized\n");
3530 /* Analyze the loop form. */
3531 vect_loop_form_info loop_form_info;
3532 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3533 if (!res)
3535 if (dump_enabled_p ())
3536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3537 "bad loop form.\n");
3538 return opt_loop_vec_info::propagate_failure (res);
3540 if (!integer_onep (loop_form_info.assumptions))
3542 /* We consider vectorizing this loop by versioning it under
3543 some assumptions. In order to do this, we need to clear
3544 existing information computed by the scev and niter analyzers. */
3545 scev_reset_htab ();
3546 free_numbers_of_iterations_estimates (loop);
3547 /* Also set a flag for this loop so that the following scev and niter
3548 analyses are done under those assumptions. */
3549 loop_constraint_set (loop, LOOP_C_FINITE);
3552 auto_vector_modes vector_modes;
3553 /* Autodetect the first vector mode to try. */
3554 vector_modes.safe_push (VOIDmode);
3555 unsigned int autovec_flags
3556 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3557 loop->simdlen != 0);
3558 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3559 && !unlimited_cost_model (loop));
3560 machine_mode autodetected_vector_mode = VOIDmode;
3561 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3562 unsigned int mode_i = 0;
3563 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3565 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3566 a mode has not been analyzed. */
3567 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3568 for (unsigned i = 0; i < vector_modes.length (); ++i)
3569 cached_vf_per_mode.safe_push (0);
3571 /* First determine the main loop vectorization mode, either the first
3572 one that works, starting with auto-detecting the vector mode and then
3573 following the target's order of preference, or the one with the
3574 lowest cost if pick_lowest_cost_p. */
3575 while (1)
3577 bool fatal;
3578 unsigned int last_mode_i = mode_i;
3579 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3580 failed. */
3581 cached_vf_per_mode[last_mode_i] = -1;
3582 opt_loop_vec_info loop_vinfo
3583 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3584 NULL, vector_modes, mode_i,
3585 autodetected_vector_mode, fatal);
3586 if (fatal)
3587 break;
3589 if (loop_vinfo)
3591 /* Analysis has been successful, so update the VF value. The
3592 VF should always be a multiple of unroll_factor and we want to
3593 capture the original VF here. */
3594 cached_vf_per_mode[last_mode_i]
3595 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3596 loop_vinfo->suggested_unroll_factor);
3597 /* Once we hit the desired simdlen for the first time,
3598 discard any previous attempts. */
3599 if (simdlen
3600 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3602 delete first_loop_vinfo;
3603 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3604 simdlen = 0;
3606 else if (pick_lowest_cost_p
3607 && first_loop_vinfo
3608 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3610 /* Pick loop_vinfo over first_loop_vinfo. */
3611 delete first_loop_vinfo;
3612 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3614 if (first_loop_vinfo == NULL)
3615 first_loop_vinfo = loop_vinfo;
3616 else
3618 delete loop_vinfo;
3619 loop_vinfo = opt_loop_vec_info::success (NULL);
3622 /* Commit to first_loop_vinfo if we have no reason to try
3623 alternatives. */
3624 if (!simdlen && !pick_lowest_cost_p)
3625 break;
3627 if (mode_i == vector_modes.length ()
3628 || autodetected_vector_mode == VOIDmode)
3629 break;
3631 /* Try the next biggest vector size. */
3632 if (dump_enabled_p ())
3633 dump_printf_loc (MSG_NOTE, vect_location,
3634 "***** Re-trying analysis with vector mode %s\n",
3635 GET_MODE_NAME (vector_modes[mode_i]));
3637 if (!first_loop_vinfo)
3638 return opt_loop_vec_info::propagate_failure (res);
3640 if (dump_enabled_p ())
3641 dump_printf_loc (MSG_NOTE, vect_location,
3642 "***** Choosing vector mode %s\n",
3643 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3645 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3646 enabled, SIMDUID is not set, it is the innermost loop and we have
3647 either already found the loop's SIMDLEN or there was no SIMDLEN to
3648 begin with.
3649 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3650 bool vect_epilogues = (!simdlen
3651 && loop->inner == NULL
3652 && param_vect_epilogues_nomask
3653 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3654 && !loop->simduid);
3655 if (!vect_epilogues)
3656 return first_loop_vinfo;
3658 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3659 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3661 /* For epilogues start the analysis from the first mode. The motivation
3662 behind starting from the beginning comes from cases where the VECTOR_MODES
3663 array may contain length-agnostic and length-specific modes. Their
3664 ordering is not guaranteed, so we could end up picking a mode for the main
3665 loop that is after the epilogue's optimal mode. */
3666 vector_modes[0] = autodetected_vector_mode;
3667 mode_i = 0;
3669 bool supports_partial_vectors =
3670 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3671 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3673 while (1)
3675 /* If the target does not support partial vectors we can shorten the
3676 number of modes to analyze for the epilogue as we know we can't pick a
3677 mode that would lead to a VF at least as big as the
3678 FIRST_VINFO_VF. */
3679 if (!supports_partial_vectors
3680 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3682 mode_i++;
3683 if (mode_i == vector_modes.length ())
3684 break;
3685 continue;
3688 if (dump_enabled_p ())
3689 dump_printf_loc (MSG_NOTE, vect_location,
3690 "***** Re-trying epilogue analysis with vector "
3691 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3693 bool fatal;
3694 opt_loop_vec_info loop_vinfo
3695 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3696 first_loop_vinfo,
3697 vector_modes, mode_i,
3698 autodetected_vector_mode, fatal);
3699 if (fatal)
3700 break;
3702 if (loop_vinfo)
3704 if (pick_lowest_cost_p)
3706 /* Keep trying to roll back vectorization attempts while the
3707 loop_vec_infos they produced were worse than this one. */
3708 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3709 while (!vinfos.is_empty ()
3710 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3712 gcc_assert (vect_epilogues);
3713 delete vinfos.pop ();
3716 /* For now only allow one epilogue loop. */
3717 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3719 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3720 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3721 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3722 || maybe_ne (lowest_th, 0U));
3723 /* Keep track of the known smallest versioning
3724 threshold. */
3725 if (ordered_p (lowest_th, th))
3726 lowest_th = ordered_min (lowest_th, th);
3728 else
3730 delete loop_vinfo;
3731 loop_vinfo = opt_loop_vec_info::success (NULL);
3734 /* For now only allow one epilogue loop, but allow
3735 pick_lowest_cost_p to replace it, so commit to the
3736 first epilogue if we have no reason to try alternatives. */
3737 if (!pick_lowest_cost_p)
3738 break;
3741 if (mode_i == vector_modes.length ())
3742 break;
3746 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3748 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3749 if (dump_enabled_p ())
3750 dump_printf_loc (MSG_NOTE, vect_location,
3751 "***** Choosing epilogue vector mode %s\n",
3752 GET_MODE_NAME
3753 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3756 return first_loop_vinfo;
3759 /* Return true if there is an in-order reduction function for CODE, storing
3760 it in *REDUC_FN if so. */
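/* For illustration only: a loop such as

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += a[i];

   compiled without -fassociative-math must preserve the left-to-right
   order of the additions; PLUS_EXPR is therefore mapped to
   IFN_FOLD_LEFT_PLUS here, and no other in-order reduction function is
   currently provided.  */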
3762 static bool
3763 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3765 if (code == PLUS_EXPR)
3767 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3768 return true;
3770 return false;
3773 /* Function reduction_fn_for_scalar_code
3775 Input:
3776 CODE - tree_code of a reduction operation.
3778 Output:
3779 REDUC_FN - the corresponding internal function to be used to reduce the
3780 vector of partial results into a single scalar result, or IFN_LAST
3781 if the operation is a supported reduction operation, but does not have
3782 such an internal function.
3784 Return FALSE if CODE currently cannot be vectorized as a reduction. */
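/* For illustration: a MAX_EXPR reduction can collapse the vector of
   partial results with a single IFN_REDUC_MAX, while MULT_EXPR returns
   IFN_LAST, i.e. the reduction is still supported but the partial
   results have to be combined without a dedicated reduction internal
   function (typically via a generic shift/extract epilogue).  */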
3786 bool
3787 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3789 if (code.is_tree_code ())
3790 switch (tree_code (code))
3792 case MAX_EXPR:
3793 *reduc_fn = IFN_REDUC_MAX;
3794 return true;
3796 case MIN_EXPR:
3797 *reduc_fn = IFN_REDUC_MIN;
3798 return true;
3800 case PLUS_EXPR:
3801 *reduc_fn = IFN_REDUC_PLUS;
3802 return true;
3804 case BIT_AND_EXPR:
3805 *reduc_fn = IFN_REDUC_AND;
3806 return true;
3808 case BIT_IOR_EXPR:
3809 *reduc_fn = IFN_REDUC_IOR;
3810 return true;
3812 case BIT_XOR_EXPR:
3813 *reduc_fn = IFN_REDUC_XOR;
3814 return true;
3816 case MULT_EXPR:
3817 case MINUS_EXPR:
3818 *reduc_fn = IFN_LAST;
3819 return true;
3821 default:
3822 return false;
3824 else
3825 switch (combined_fn (code))
3827 CASE_CFN_FMAX:
3828 *reduc_fn = IFN_REDUC_FMAX;
3829 return true;
3831 CASE_CFN_FMIN:
3832 *reduc_fn = IFN_REDUC_FMIN;
3833 return true;
3835 default:
3836 return false;
3840 /* If there is a neutral value X such that a reduction would not be affected
3841 by the introduction of additional X elements, return that X, otherwise
3842 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3843 of the scalar elements. If the reduction has just a single initial value
3844 then INITIAL_VALUE is that value, otherwise it is null. */
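/* For example, padding the final vector of a PLUS_EXPR sum with extra 0
   elements (or a MULT_EXPR product with 1, a BIT_AND_EXPR reduction with
   all-ones) does not change the result, so those constants are the
   neutral values; MIN_EXPR and MAX_EXPR have no universal neutral
   constant, so the single initial value is used when one is known.  */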
3846 tree
3847 neutral_op_for_reduction (tree scalar_type, code_helper code,
3848 tree initial_value)
3850 if (code.is_tree_code ())
3851 switch (tree_code (code))
3853 case WIDEN_SUM_EXPR:
3854 case DOT_PROD_EXPR:
3855 case SAD_EXPR:
3856 case PLUS_EXPR:
3857 case MINUS_EXPR:
3858 case BIT_IOR_EXPR:
3859 case BIT_XOR_EXPR:
3860 return build_zero_cst (scalar_type);
3862 case MULT_EXPR:
3863 return build_one_cst (scalar_type);
3865 case BIT_AND_EXPR:
3866 return build_all_ones_cst (scalar_type);
3868 case MAX_EXPR:
3869 case MIN_EXPR:
3870 return initial_value;
3872 default:
3873 return NULL_TREE;
3875 else
3876 switch (combined_fn (code))
3878 CASE_CFN_FMIN:
3879 CASE_CFN_FMAX:
3880 return initial_value;
3882 default:
3883 return NULL_TREE;
3887 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3888 STMT is printed with a message MSG. */
3890 static void
3891 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3893 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3896 /* Return true if we need an in-order reduction for operation CODE
3897 on type TYPE. */
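/* For illustration: with -ftrapv a signed sum such as

     int s = 0;
     for (int i = 0; i < n; i++)
       s += a[i];

   may trap on overflow, so reassociating the additions could introduce a
   trap the scalar code would not have had; hence an in-order reduction is
   required.  The floating-point cases below likewise demand in-order
   evaluation unless -fassociative-math is given, except for MIN/MAX style
   operations, which are insensitive to ordering.  */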
3900 bool
3901 needs_fold_left_reduction_p (tree type, code_helper code)
3903 /* CHECKME: check for !flag_finite_math_only too? */
3904 if (SCALAR_FLOAT_TYPE_P (type))
3906 if (code.is_tree_code ())
3907 switch (tree_code (code))
3909 case MIN_EXPR:
3910 case MAX_EXPR:
3911 return false;
3913 default:
3914 return !flag_associative_math;
3916 else
3917 switch (combined_fn (code))
3919 CASE_CFN_FMIN:
3920 CASE_CFN_FMAX:
3921 return false;
3923 default:
3924 return !flag_associative_math;
3928 if (INTEGRAL_TYPE_P (type))
3929 return (!code.is_tree_code ()
3930 || !operation_no_trapping_overflow (type, tree_code (code)));
3932 if (SAT_FIXED_POINT_TYPE_P (type))
3933 return true;
3935 return false;
3938 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3939 has a handled computation expression. Store the main reduction
3940 operation in *CODE. */
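/* As an illustration (SSA names invented for the example), for

     sum_1 = PHI <sum_0(preheader), sum_4(latch)>
     ...
     sum_3 = _2 + sum_1;
     sum_4 = _5 + sum_3;

   the walk from the latch value sum_4 back to the PHI result sum_1
   records the path sum_4 -> sum_3 -> sum_1 and sets *CODE to
   PLUS_EXPR.  */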
3942 static bool
3943 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3944 tree loop_arg, code_helper *code,
3945 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3947 auto_bitmap visited;
3948 tree lookfor = PHI_RESULT (phi);
3949 ssa_op_iter curri;
3950 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3951 while (USE_FROM_PTR (curr) != loop_arg)
3952 curr = op_iter_next_use (&curri);
3953 curri.i = curri.numops;
3956 path.safe_push (std::make_pair (curri, curr));
3957 tree use = USE_FROM_PTR (curr);
3958 if (use == lookfor)
3959 break;
3960 gimple *def = SSA_NAME_DEF_STMT (use);
3961 if (gimple_nop_p (def)
3962 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3964 pop:
3967 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3968 curri = x.first;
3969 curr = x.second;
3971 curr = op_iter_next_use (&curri);
3972 /* Skip already visited or non-SSA operands (from iterating
3973 over PHI args). */
3974 while (curr != NULL_USE_OPERAND_P
3975 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3976 || ! bitmap_set_bit (visited,
3977 SSA_NAME_VERSION
3978 (USE_FROM_PTR (curr)))));
3980 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3981 if (curr == NULL_USE_OPERAND_P)
3982 break;
3984 else
3986 if (gimple_code (def) == GIMPLE_PHI)
3987 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3988 else
3989 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3990 while (curr != NULL_USE_OPERAND_P
3991 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3992 || ! bitmap_set_bit (visited,
3993 SSA_NAME_VERSION
3994 (USE_FROM_PTR (curr)))))
3995 curr = op_iter_next_use (&curri);
3996 if (curr == NULL_USE_OPERAND_P)
3997 goto pop;
4000 while (1);
4001 if (dump_file && (dump_flags & TDF_DETAILS))
4003 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4004 unsigned i;
4005 std::pair<ssa_op_iter, use_operand_p> *x;
4006 FOR_EACH_VEC_ELT (path, i, x)
4007 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4008 dump_printf (MSG_NOTE, "\n");
4011 /* Check whether the reduction path detected is valid. */
4012 bool fail = path.length () == 0;
4013 bool neg = false;
4014 int sign = -1;
4015 *code = ERROR_MARK;
4016 for (unsigned i = 1; i < path.length (); ++i)
4018 gimple *use_stmt = USE_STMT (path[i].second);
4019 gimple_match_op op;
4020 if (!gimple_extract_op (use_stmt, &op))
4022 fail = true;
4023 break;
4025 unsigned int opi = op.num_ops;
4026 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4028 /* The following makes sure we can compute the operand index
4029 easily, plus it mostly disallows chaining via COND_EXPR condition
4030 operands. */
4031 for (opi = 0; opi < op.num_ops; ++opi)
4032 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4033 break;
4035 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4037 for (opi = 0; opi < op.num_ops; ++opi)
4038 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4039 break;
4041 if (opi == op.num_ops)
4043 fail = true;
4044 break;
4046 op.code = canonicalize_code (op.code, op.type);
4047 if (op.code == MINUS_EXPR)
4049 op.code = PLUS_EXPR;
4050 /* Track whether we negate the reduction value each iteration. */
4051 if (op.ops[1] == op.ops[opi])
4052 neg = ! neg;
4054 if (CONVERT_EXPR_CODE_P (op.code)
4055 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4057 else if (*code == ERROR_MARK)
4059 *code = op.code;
4060 sign = TYPE_SIGN (op.type);
4062 else if (op.code != *code)
4064 fail = true;
4065 break;
4067 else if ((op.code == MIN_EXPR
4068 || op.code == MAX_EXPR)
4069 && sign != TYPE_SIGN (op.type))
4071 fail = true;
4072 break;
4074 /* Check that the op is used in only a single stmt. For the
4075 non-value-changing tail and the last stmt, allow out-of-loop uses.
4076 ??? We could relax this and handle arbitrary live stmts by
4077 forcing a scalar epilogue for example. */
4078 imm_use_iterator imm_iter;
4079 use_operand_p use_p;
4080 gimple *op_use_stmt;
4081 unsigned cnt = 0;
4082 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4083 if (!is_gimple_debug (op_use_stmt)
4084 && (*code != ERROR_MARK
4085 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
4086 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4087 cnt++;
4088 if (cnt != 1)
4090 fail = true;
4091 break;
4094 return ! fail && ! neg && *code != ERROR_MARK;
4097 bool
4098 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4099 tree loop_arg, enum tree_code code)
4101 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4102 code_helper code_;
4103 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4104 && code_ == code);
4109 /* Function vect_is_simple_reduction
4111 (1) Detect a cross-iteration def-use cycle that represents a simple
4112 reduction computation. We look for the following pattern:
4114 loop_header:
4115 a1 = phi < a0, a2 >
4116 a3 = ...
4117 a2 = operation (a3, a1)
4121 a3 = ...
4122 loop_header:
4123 a1 = phi < a0, a2 >
4124 a2 = operation (a3, a1)
4126 such that:
4127 1. operation is commutative and associative and it is safe to
4128 change the order of the computation
4129 2. no uses for a2 in the loop (a2 is used out of the loop)
4130 3. no uses of a1 in the loop besides the reduction operation
4131 4. no uses of a1 outside the loop.
4133 Conditions 1,4 are tested here.
4134 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4136 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4137 nested cycles.
4139 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4140 reductions:
4142 a1 = phi < a0, a2 >
4143 inner loop (def of a3)
4144 a2 = phi < a3 >
4146 (4) Detect condition expressions, i.e.:
4147 for (int i = 0; i < N; i++)
4148 if (a[i] < val)
4149 ret_val = a[i];
4153 static stmt_vec_info
4154 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4155 bool *double_reduc, bool *reduc_chain_p, bool slp)
4157 gphi *phi = as_a <gphi *> (phi_info->stmt);
4158 gimple *phi_use_stmt = NULL;
4159 imm_use_iterator imm_iter;
4160 use_operand_p use_p;
4162 *double_reduc = false;
4163 *reduc_chain_p = false;
4164 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4166 tree phi_name = PHI_RESULT (phi);
4167 /* ??? If there are no uses of the PHI result the inner loop reduction
4168 won't be detected as possibly double-reduction by vectorizable_reduction
4169 because that tries to walk the PHI arg from the preheader edge which
4170 can be constant. See PR60382. */
4171 if (has_zero_uses (phi_name))
4172 return NULL;
4173 class loop *loop = (gimple_bb (phi))->loop_father;
4174 unsigned nphi_def_loop_uses = 0;
4175 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4177 gimple *use_stmt = USE_STMT (use_p);
4178 if (is_gimple_debug (use_stmt))
4179 continue;
4181 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4183 if (dump_enabled_p ())
4184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4185 "intermediate value used outside loop.\n");
4187 return NULL;
4190 nphi_def_loop_uses++;
4191 phi_use_stmt = use_stmt;
4194 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4195 if (TREE_CODE (latch_def) != SSA_NAME)
4197 if (dump_enabled_p ())
4198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4199 "reduction: not ssa_name: %T\n", latch_def);
4200 return NULL;
4203 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4204 if (!def_stmt_info
4205 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4206 return NULL;
4208 bool nested_in_vect_loop
4209 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4210 unsigned nlatch_def_loop_uses = 0;
4211 auto_vec<gphi *, 3> lcphis;
4212 bool inner_loop_of_double_reduc = false;
4213 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4215 gimple *use_stmt = USE_STMT (use_p);
4216 if (is_gimple_debug (use_stmt))
4217 continue;
4218 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4219 nlatch_def_loop_uses++;
4220 else
4222 /* We can have more than one loop-closed PHI. */
4223 lcphis.safe_push (as_a <gphi *> (use_stmt));
4224 if (nested_in_vect_loop
4225 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4226 == vect_double_reduction_def))
4227 inner_loop_of_double_reduc = true;
4231 /* If we are vectorizing an inner reduction, we execute it
4232 in the original order only when we are not dealing with a
4233 double reduction. */
4234 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4236 if (dump_enabled_p ())
4237 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4238 "detected nested cycle: ");
4239 return def_stmt_info;
4242 /* When the inner loop of a double reduction ends up with more than
4243 one loop-closed PHI we have failed to classify alternate such
4244 PHIs as double reduction, leading to wrong code. See PR103237. */
4245 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4247 if (dump_enabled_p ())
4248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4249 "unhandle double reduction\n");
4250 return NULL;
4253 /* If this isn't a nested cycle or if the nested cycle reduction value
4254 is used outside of the inner loop we cannot handle uses of the reduction
4255 value. */
4256 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4258 if (dump_enabled_p ())
4259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4260 "reduction used in loop.\n");
4261 return NULL;
4264 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4265 defined in the inner loop. */
4266 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4268 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4269 if (gimple_phi_num_args (def_stmt) != 1
4270 || TREE_CODE (op1) != SSA_NAME)
4272 if (dump_enabled_p ())
4273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4274 "unsupported phi node definition.\n");
4276 return NULL;
4279 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4280 and the latch definition op1. */
4281 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4282 if (gimple_bb (def1)
4283 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4284 && loop->inner
4285 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4286 && (is_gimple_assign (def1) || is_gimple_call (def1))
4287 && is_a <gphi *> (phi_use_stmt)
4288 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4289 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4290 loop_latch_edge (loop->inner))))
4292 if (dump_enabled_p ())
4293 report_vect_op (MSG_NOTE, def_stmt,
4294 "detected double reduction: ");
4296 *double_reduc = true;
4297 return def_stmt_info;
4300 return NULL;
4303 /* Look for the expression computing latch_def from the loop PHI result. */
4304 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4305 code_helper code;
4306 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4307 path))
4309 STMT_VINFO_REDUC_CODE (phi_info) = code;
4310 if (code == COND_EXPR && !nested_in_vect_loop)
4311 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4313 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4314 reduction chain for which the additional restriction is that
4315 all operations in the chain are the same. */
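/* For illustration, a source loop of the form

     for (int i = 0; i < n; i++)
       {
         s += a[2 * i];
         s += a[2 * i + 1];
       }

   produces two PLUS_EXPR statements feeding the same reduction PHI; the
   REDUC_GROUP_* links built below record them as a reduction chain for
   SLP analysis.  */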
4316 auto_vec<stmt_vec_info, 8> reduc_chain;
4317 unsigned i;
4318 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4319 for (i = path.length () - 1; i >= 1; --i)
4321 gimple *stmt = USE_STMT (path[i].second);
4322 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4323 gimple_match_op op;
4324 if (!gimple_extract_op (stmt, &op))
4325 gcc_unreachable ();
4326 if (gassign *assign = dyn_cast<gassign *> (stmt))
4327 STMT_VINFO_REDUC_IDX (stmt_info)
4328 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4329 else
4331 gcall *call = as_a<gcall *> (stmt);
4332 STMT_VINFO_REDUC_IDX (stmt_info)
4333 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4335 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4336 && (i == 1 || i == path.length () - 1));
4337 if ((op.code != code && !leading_conversion)
4338 /* We can only handle the final value in epilogue
4339 generation for reduction chains. */
4340 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4341 is_slp_reduc = false;
4342 /* For reduction chains we support trailing/leading
4343 conversions. We do not store those in the actual chain. */
4344 if (leading_conversion)
4345 continue;
4346 reduc_chain.safe_push (stmt_info);
4348 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4350 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4352 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4353 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4355 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4356 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4358 /* Save the chain for further analysis in SLP detection. */
4359 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4360 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4362 *reduc_chain_p = true;
4363 if (dump_enabled_p ())
4364 dump_printf_loc (MSG_NOTE, vect_location,
4365 "reduction: detected reduction chain\n");
4367 else if (dump_enabled_p ())
4368 dump_printf_loc (MSG_NOTE, vect_location,
4369 "reduction: detected reduction\n");
4371 return def_stmt_info;
4374 if (dump_enabled_p ())
4375 dump_printf_loc (MSG_NOTE, vect_location,
4376 "reduction: unknown pattern\n");
4378 return NULL;
4381 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4382 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4383 or -1 if not known. */
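/* Worked example with made-up numbers: with an assumed VF of 8, 100 known
   scalar iterations and 3 prologue iterations peeled, the epilogue gets
   (100 - 3) % 8 = 1 iteration; if peeling for gaps is required and that
   remainder had been 0, a full VF (8 iterations) is used instead.  */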
4385 static int
4386 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4388 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4389 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4391 if (dump_enabled_p ())
4392 dump_printf_loc (MSG_NOTE, vect_location,
4393 "cost model: epilogue peel iters set to vf/2 "
4394 "because loop iterations are unknown .\n");
4395 return assumed_vf / 2;
4397 else
4399 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4400 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4401 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4402 /* If we need to peel for gaps but no epilogue peeling would otherwise
4403 be required, we have to peel VF iterations. */
4404 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4405 peel_iters_epilogue = assumed_vf;
4406 return peel_iters_epilogue;
4410 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4411 int
4412 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4413 int *peel_iters_epilogue,
4414 stmt_vector_for_cost *scalar_cost_vec,
4415 stmt_vector_for_cost *prologue_cost_vec,
4416 stmt_vector_for_cost *epilogue_cost_vec)
4418 int retval = 0;
4420 *peel_iters_epilogue
4421 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4423 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4425 /* If peeled iterations are known but the number of scalar loop
4426 iterations is unknown, count a taken branch per peeled loop. */
4427 if (peel_iters_prologue > 0)
4428 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4429 vect_prologue);
4430 if (*peel_iters_epilogue > 0)
4431 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4432 vect_epilogue);
4435 stmt_info_for_cost *si;
4436 int j;
4437 if (peel_iters_prologue)
4438 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4439 retval += record_stmt_cost (prologue_cost_vec,
4440 si->count * peel_iters_prologue,
4441 si->kind, si->stmt_info, si->misalign,
4442 vect_prologue);
4443 if (*peel_iters_epilogue)
4444 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4445 retval += record_stmt_cost (epilogue_cost_vec,
4446 si->count * *peel_iters_epilogue,
4447 si->kind, si->stmt_info, si->misalign,
4448 vect_epilogue);
4450 return retval;
4453 /* Function vect_estimate_min_profitable_iters
4455 Return the number of iterations required for the vector version of the
4456 loop to be profitable relative to the cost of the scalar version of the
4457 loop.
4459 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4460 of iterations for vectorization. A value of -1 means loop vectorization
4461 is not profitable. This returned value may be used for a dynamic
4462 profitability check.
4464 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4465 for static check against estimated number of iterations. */
4467 static void
4468 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4469 int *ret_min_profitable_niters,
4470 int *ret_min_profitable_estimate,
4471 unsigned *suggested_unroll_factor)
4473 int min_profitable_iters;
4474 int min_profitable_estimate;
4475 int peel_iters_prologue;
4476 int peel_iters_epilogue;
4477 unsigned vec_inside_cost = 0;
4478 int vec_outside_cost = 0;
4479 unsigned vec_prologue_cost = 0;
4480 unsigned vec_epilogue_cost = 0;
4481 int scalar_single_iter_cost = 0;
4482 int scalar_outside_cost = 0;
4483 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4484 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4485 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4487 /* Cost model disabled. */
4488 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4490 if (dump_enabled_p ())
4491 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4492 *ret_min_profitable_niters = 0;
4493 *ret_min_profitable_estimate = 0;
4494 return;
4497 /* Requires loop versioning tests to handle misalignment. */
4498 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4500 /* FIXME: Make cost depend on complexity of individual check. */
4501 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4502 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4503 if (dump_enabled_p ())
4504 dump_printf (MSG_NOTE,
4505 "cost model: Adding cost of checks for loop "
4506 "versioning to treat misalignment.\n");
4509 /* Requires loop versioning with alias checks. */
4510 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4512 /* FIXME: Make cost depend on complexity of individual check. */
4513 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4514 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4515 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4516 if (len)
4517 /* Count LEN - 1 ANDs and LEN comparisons. */
4518 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4519 scalar_stmt, vect_prologue);
4520 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4521 if (len)
4523 /* Count LEN - 1 ANDs and LEN comparisons. */
4524 unsigned int nstmts = len * 2 - 1;
4525 /* +1 for each bias that needs adding. */
4526 for (unsigned int i = 0; i < len; ++i)
4527 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4528 nstmts += 1;
4529 (void) add_stmt_cost (target_cost_data, nstmts,
4530 scalar_stmt, vect_prologue);
4532 if (dump_enabled_p ())
4533 dump_printf (MSG_NOTE,
4534 "cost model: Adding cost of checks for loop "
4535 "versioning aliasing.\n");
4538 /* Requires loop versioning with niter checks. */
4539 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4541 /* FIXME: Make cost depend on complexity of individual check. */
4542 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4543 NULL, NULL, NULL_TREE, 0, vect_prologue);
4544 if (dump_enabled_p ())
4545 dump_printf (MSG_NOTE,
4546 "cost model: Adding cost of checks for loop "
4547 "versioning niters.\n");
4550 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4551 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4552 vect_prologue);
4554 /* Count statements in scalar loop. Using this as scalar cost for a single
4555 iteration for now.
4557 TODO: Add outer loop support.
4559 TODO: Consider assigning different costs to different scalar
4560 statements. */
4562 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4564 /* Add additional cost for the peeled instructions in prologue and epilogue
4565 loop. (For fully-masked loops there will be no peeling.)
4567 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4568 at compile time, we assume it's vf/2 (the worst case would be vf-1).
4570 TODO: Build an expression that represents peel_iters for prologue and
4571 epilogue to be used in a run-time test. */
4573 bool prologue_need_br_taken_cost = false;
4574 bool prologue_need_br_not_taken_cost = false;
4576 /* Calculate peel_iters_prologue. */
4577 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4578 peel_iters_prologue = 0;
4579 else if (npeel < 0)
4581 peel_iters_prologue = assumed_vf / 2;
4582 if (dump_enabled_p ())
4583 dump_printf (MSG_NOTE, "cost model: "
4584 "prologue peel iters set to vf/2.\n");
4586 /* If peeled iterations are unknown, count a taken branch and a not taken
4587 branch per peeled loop. Even if scalar loop iterations are known,
4588 vector iterations are not known since peeled prologue iterations are
4589 not known. Hence guards remain the same. */
4590 prologue_need_br_taken_cost = true;
4591 prologue_need_br_not_taken_cost = true;
4593 else
4595 peel_iters_prologue = npeel;
4596 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4597 /* If peeled iterations are known but the number of scalar loop
4598 iterations is unknown, count a taken branch per peeled loop. */
4599 prologue_need_br_taken_cost = true;
4602 bool epilogue_need_br_taken_cost = false;
4603 bool epilogue_need_br_not_taken_cost = false;
4605 /* Calculate peel_iters_epilogue. */
4606 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4607 /* We need to peel exactly one iteration for gaps. */
4608 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4609 else if (npeel < 0)
4611 /* If peeling for alignment is unknown, the loop bound of the main
4612 loop becomes unknown. */
4613 peel_iters_epilogue = assumed_vf / 2;
4614 if (dump_enabled_p ())
4615 dump_printf (MSG_NOTE, "cost model: "
4616 "epilogue peel iters set to vf/2 because "
4617 "peeling for alignment is unknown.\n");
4619 /* See the same reason above in peel_iters_prologue calculation. */
4620 epilogue_need_br_taken_cost = true;
4621 epilogue_need_br_not_taken_cost = true;
4623 else
4625 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4626 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4627 /* If peeled iterations are known but the number of scalar loop
4628 iterations is unknown, count a taken branch per peeled loop. */
4629 epilogue_need_br_taken_cost = true;
4632 stmt_info_for_cost *si;
4633 int j;
4634 /* Add costs associated with peel_iters_prologue. */
4635 if (peel_iters_prologue)
4636 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4638 (void) add_stmt_cost (target_cost_data,
4639 si->count * peel_iters_prologue, si->kind,
4640 si->stmt_info, si->node, si->vectype,
4641 si->misalign, vect_prologue);
4644 /* Add costs associated with peel_iters_epilogue. */
4645 if (peel_iters_epilogue)
4646 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4648 (void) add_stmt_cost (target_cost_data,
4649 si->count * peel_iters_epilogue, si->kind,
4650 si->stmt_info, si->node, si->vectype,
4651 si->misalign, vect_epilogue);
4654 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4656 if (prologue_need_br_taken_cost)
4657 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4658 vect_prologue);
4660 if (prologue_need_br_not_taken_cost)
4661 (void) add_stmt_cost (target_cost_data, 1,
4662 cond_branch_not_taken, vect_prologue);
4664 if (epilogue_need_br_taken_cost)
4665 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4666 vect_epilogue);
4668 if (epilogue_need_br_not_taken_cost)
4669 (void) add_stmt_cost (target_cost_data, 1,
4670 cond_branch_not_taken, vect_epilogue);
4672 /* Take care of special costs for rgroup controls of partial vectors. */
4673 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4674 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4675 == vect_partial_vectors_avx512))
4677 /* Calculate how many masks we need to generate. */
4678 unsigned int num_masks = 0;
4679 bool need_saturation = false;
4680 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4681 if (rgm.type)
4683 unsigned nvectors = rgm.factor;
4684 num_masks += nvectors;
4685 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4686 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4687 need_saturation = true;
4690 /* ??? The target isn't able to identify the costs below as
4691 producing masks so it cannot penalize cases where we'd run
4692 out of mask registers for example. */
4694 /* ??? We are also failing to account for smaller vector masks
4695 we generate by splitting larger masks in vect_get_loop_mask. */
4697 /* In the worst case, we need to generate each mask in the prologue
4698 and in the loop body. We need one splat per group and one
4699 compare per mask.
4701 Sometimes the prologue mask will fold to a constant,
4702 so the actual prologue cost might be smaller. However, it's
4703 simpler and safer to use the worst-case cost; if this ends up
4704 being the tie-breaker between vectorizing or not, then it's
4705 probably better not to vectorize. */
4706 (void) add_stmt_cost (target_cost_data,
4707 num_masks
4708 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4709 vector_stmt, NULL, NULL, NULL_TREE, 0,
4710 vect_prologue);
4711 (void) add_stmt_cost (target_cost_data,
4712 num_masks
4713 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4714 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4716 /* When we need saturation we need it both in the prologue and
4717 in the loop body. */
4718 if (need_saturation)
4720 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4721 NULL, NULL, NULL_TREE, 0, vect_prologue);
4722 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4723 NULL, NULL, NULL_TREE, 0, vect_body);
4726 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4727 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4728 == vect_partial_vectors_while_ult))
4730 /* Calculate how many masks we need to generate. */
4731 unsigned int num_masks = 0;
4732 rgroup_controls *rgm;
4733 unsigned int num_vectors_m1;
4734 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4735 num_vectors_m1, rgm)
4736 if (rgm->type)
4737 num_masks += num_vectors_m1 + 1;
4738 gcc_assert (num_masks > 0);
4740 /* In the worst case, we need to generate each mask in the prologue
4741 and in the loop body. One of the loop body mask instructions
4742 replaces the comparison in the scalar loop, and since we don't
4743 count the scalar comparison against the scalar body, we shouldn't
4744 count that vector instruction against the vector body either.
4746 Sometimes we can use unpacks instead of generating prologue
4747 masks and sometimes the prologue mask will fold to a constant,
4748 so the actual prologue cost might be smaller. However, it's
4749 simpler and safer to use the worst-case cost; if this ends up
4750 being the tie-breaker between vectorizing or not, then it's
4751 probably better not to vectorize. */
4752 (void) add_stmt_cost (target_cost_data, num_masks,
4753 vector_stmt, NULL, NULL, NULL_TREE, 0,
4754 vect_prologue);
4755 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4756 vector_stmt, NULL, NULL, NULL_TREE, 0,
4757 vect_body);
4759 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4761 /* Referring to the functions vect_set_loop_condition_partial_vectors
4762 and vect_set_loop_controls_directly, we need to generate each
4763 length in the prologue and in the loop body if required. Although
4764 there are some possible optimizations, we consider the worst case
4765 here. */
4767 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4768 signed char partial_load_store_bias
4769 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4770 bool need_iterate_p
4771 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4772 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4774 /* Calculate how many statements need to be added. */
4775 unsigned int prologue_stmts = 0;
4776 unsigned int body_stmts = 0;
4778 rgroup_controls *rgc;
4779 unsigned int num_vectors_m1;
4780 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4781 if (rgc->type)
4783 /* May need one SHIFT for nitems_total computation. */
4784 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4785 if (nitems != 1 && !niters_known_p)
4786 prologue_stmts += 1;
4788 /* May need one MAX and one MINUS for wrap around. */
4789 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4790 prologue_stmts += 2;
4792 /* Need one MAX and one MINUS for each batch limit except for
4793 the first one. */
4794 prologue_stmts += num_vectors_m1 * 2;
4796 unsigned int num_vectors = num_vectors_m1 + 1;
4798 /* Need to set up lengths in prologue, only one MIN required
4799 for each since start index is zero. */
4800 prologue_stmts += num_vectors;
4802 /* If we have a non-zero partial load bias, we need one PLUS
4803 to adjust the load length. */
4804 if (partial_load_store_bias != 0)
4805 body_stmts += 1;
4807 /* Each may need two MINs and one MINUS to update lengths in body
4808 for next iteration. */
4809 if (need_iterate_p)
4810 body_stmts += 3 * num_vectors;
4813 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4814 scalar_stmt, vect_prologue);
4815 (void) add_stmt_cost (target_cost_data, body_stmts,
4816 scalar_stmt, vect_body);
4819 /* FORNOW: The scalar outside cost is incremented in one of the
4820 following ways:
4822 1. The vectorizer checks for alignment and aliasing and generates
4823 a condition that allows dynamic vectorization. A cost model
4824 check is ANDed with the versioning condition. Hence scalar code
4825 path now has the added cost of the versioning check.
4827 if (cost > th & versioning_check)
4828 jmp to vector code
4830 Hence the run-time scalar cost is incremented by a not-taken branch cost.
4832 2. The vectorizer then checks if a prologue is required. If the
4833 cost model check was not done before during versioning, it has to
4834 be done before the prologue check.
4836 if (cost <= th)
4837 prologue = scalar_iters
4838 if (prologue == 0)
4839 jmp to vector code
4840 else
4841 execute prologue
4842 if (prologue == num_iters)
4843 go to exit
4845 Hence the run-time scalar cost is incremented by a taken branch,
4846 plus a not-taken branch, plus a taken branch cost.
4848 3. The vectorizer then checks if an epilogue is required. If the
4849 cost model check was not done before during prologue check, it
4850 has to be done with the epilogue check.
4852 if (prologue == 0)
4853 jmp to vector code
4854 else
4855 execute prologue
4856 if (prologue == num_iters)
4857 go to exit
4858 vector code:
4859 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4860 jmp to epilogue
4862 Hence the run-time scalar cost should be incremented by 2 taken
4863 branches.
4865 TODO: The back end may reorder the BBs differently and reverse
4866 conditions/branch directions. Change the estimates below to
4867 something more reasonable. */
4869 /* If the number of iterations is known and we do not do versioning, we can
4870 decide whether to vectorize at compile time. Hence the scalar version
4871 does not carry cost model guard costs. */
4872 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4873 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4875 /* Cost model check occurs at versioning. */
4876 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4877 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4878 else
4880 /* Cost model check occurs at prologue generation. */
4881 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4882 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4883 + vect_get_stmt_cost (cond_branch_not_taken);
4884 /* Cost model check occurs at epilogue generation. */
4885 else
4886 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4890 /* Complete the target-specific cost calculations. */
4891 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4892 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4893 suggested_unroll_factor);
4895 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4896 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4897 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4898 *suggested_unroll_factor,
4899 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4901 if (dump_enabled_p ())
4902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4903 "can't unroll as unrolled vectorization factor larger"
4904 " than maximum vectorization factor: "
4905 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4906 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4907 *suggested_unroll_factor = 1;
4910 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4912 if (dump_enabled_p ())
4914 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4915 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4916 vec_inside_cost);
4917 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4918 vec_prologue_cost);
4919 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4920 vec_epilogue_cost);
4921 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4922 scalar_single_iter_cost);
4923 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4924 scalar_outside_cost);
4925 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4926 vec_outside_cost);
4927 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4928 peel_iters_prologue);
4929 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4930 peel_iters_epilogue);
4933 /* Calculate number of iterations required to make the vector version
4934 profitable, relative to the loop bodies only. The following condition
4935 must hold true:
4936 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4937 where
4938 SIC = scalar iteration cost, VIC = vector iteration cost,
4939 VOC = vector outside cost, VF = vectorization factor,
4940 NPEEL = prologue iterations + epilogue iterations,
4941 SOC = scalar outside cost for run time cost model check. */
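/* Worked example with illustrative costs only: for SIC = 4, VIC = 12,
   VF = 4, VOC = 20, NPEEL = 2 and SOC = 0, and treating the division as
   exact, the condition becomes 4 * niters > 3 * (niters - 2) + 20, i.e.
   niters > 14, so roughly 15 iterations are needed before the vector
   version wins.  */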
4943 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4944 - vec_inside_cost);
4945 if (saving_per_viter <= 0)
4947 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4948 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4949 "vectorization did not happen for a simd loop");
4951 if (dump_enabled_p ())
4952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4953 "cost model: the vector iteration cost = %d "
4954 "divided by the scalar iteration cost = %d "
4955 "is greater or equal to the vectorization factor = %d"
4956 ".\n",
4957 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4958 *ret_min_profitable_niters = -1;
4959 *ret_min_profitable_estimate = -1;
4960 return;
4963 /* ??? The "if" arm is written to handle all cases; see below for what
4964 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4965 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4967 /* Rewriting the condition above in terms of the number of
4968 vector iterations (vniters) rather than the number of
4969 scalar iterations (niters) gives:
4971 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4973 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4975 For integer N, X and Y when X > 0:
4977 N * X > Y <==> N >= (Y /[floor] X) + 1. */
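/* E.g. for Y = 10 and X = 4: N * 4 > 10 first holds at N = 10/4 + 1 = 3.  */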
4978 int outside_overhead = (vec_outside_cost
4979 - scalar_single_iter_cost * peel_iters_prologue
4980 - scalar_single_iter_cost * peel_iters_epilogue
4981 - scalar_outside_cost);
4982 /* We're only interested in cases that require at least one
4983 vector iteration. */
4984 int min_vec_niters = 1;
4985 if (outside_overhead > 0)
4986 min_vec_niters = outside_overhead / saving_per_viter + 1;
4988 if (dump_enabled_p ())
4989 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4990 min_vec_niters);
4992 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4994 /* Now that we know the minimum number of vector iterations,
4995 find the minimum niters for which the scalar cost is larger:
4997 SIC * niters > VIC * vniters + VOC - SOC
4999 We know that the minimum niters is no more than
5000 vniters * VF + NPEEL, but it might be (and often is) less
5001 than that if a partial vector iteration is cheaper than the
5002 equivalent scalar code. */
5003 int threshold = (vec_inside_cost * min_vec_niters
5004 + vec_outside_cost
5005 - scalar_outside_cost);
5006 if (threshold <= 0)
5007 min_profitable_iters = 1;
5008 else
5009 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5011 else
5012 /* Convert the number of vector iterations into a number of
5013 scalar iterations. */
5014 min_profitable_iters = (min_vec_niters * assumed_vf
5015 + peel_iters_prologue
5016 + peel_iters_epilogue);
5018 else
5020 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5021 * assumed_vf
5022 - vec_inside_cost * peel_iters_prologue
5023 - vec_inside_cost * peel_iters_epilogue);
5024 if (min_profitable_iters <= 0)
5025 min_profitable_iters = 0;
5026 else
5028 min_profitable_iters /= saving_per_viter;
5030 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5031 <= (((int) vec_inside_cost * min_profitable_iters)
5032 + (((int) vec_outside_cost - scalar_outside_cost)
5033 * assumed_vf)))
5034 min_profitable_iters++;
5038 if (dump_enabled_p ())
5039 dump_printf (MSG_NOTE,
5040 " Calculated minimum iters for profitability: %d\n",
5041 min_profitable_iters);
5043 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5044 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5045 /* We want the vectorized loop to execute at least once. */
5046 min_profitable_iters = assumed_vf + peel_iters_prologue;
5047 else if (min_profitable_iters < peel_iters_prologue)
5048 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5049 vectorized loop executes at least once. */
5050 min_profitable_iters = peel_iters_prologue;
5052 if (dump_enabled_p ())
5053 dump_printf_loc (MSG_NOTE, vect_location,
5054 " Runtime profitability threshold = %d\n",
5055 min_profitable_iters);
5057 *ret_min_profitable_niters = min_profitable_iters;
5059 /* Calculate number of iterations required to make the vector version
5060 profitable, relative to the loop bodies only.
5062 The non-vectorized variant costs SIC * niters and it must win over the vector
5063 variant on the expected loop trip count. The following condition must hold true:
5064 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5066 if (vec_outside_cost <= 0)
5067 min_profitable_estimate = 0;
5068 /* ??? This "else if" arm is written to handle all cases; see below for
5069 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5070 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5072 /* This is a repeat of the code above, but with + SOC rather
5073 than - SOC. */
5074 int outside_overhead = (vec_outside_cost
5075 - scalar_single_iter_cost * peel_iters_prologue
5076 - scalar_single_iter_cost * peel_iters_epilogue
5077 + scalar_outside_cost);
5078 int min_vec_niters = 1;
5079 if (outside_overhead > 0)
5080 min_vec_niters = outside_overhead / saving_per_viter + 1;
5082 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5084 int threshold = (vec_inside_cost * min_vec_niters
5085 + vec_outside_cost
5086 + scalar_outside_cost);
5087 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5089 else
5090 min_profitable_estimate = (min_vec_niters * assumed_vf
5091 + peel_iters_prologue
5092 + peel_iters_epilogue);
5094 else
5096 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5097 * assumed_vf
5098 - vec_inside_cost * peel_iters_prologue
5099 - vec_inside_cost * peel_iters_epilogue)
5100 / ((scalar_single_iter_cost * assumed_vf)
5101 - vec_inside_cost);
5103 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5104 if (dump_enabled_p ())
5105 dump_printf_loc (MSG_NOTE, vect_location,
5106 " Static estimate profitability threshold = %d\n",
5107 min_profitable_estimate);
5109 *ret_min_profitable_estimate = min_profitable_estimate;
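   /* As a purely illustrative example (the costs below are made up and do
      not come from any particular target): assume a loop using partial
      vectors, no peeling, SIC = 4, VIC = 6, VOC = 14, SOC = 2 and a
      minimum of two vector iterations.  The computations above then give

	runtime threshold = (VIC * 2 + VOC - SOC) / SIC + 1 = 24 / 4 + 1 = 7
	static estimate   = (VIC * 2 + VOC + SOC) / SIC + 1 = 28 / 4 + 1 = 8

      so the two results differ only in the sign applied to SOC, matching
      the two conditions quoted in the comments above.  */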
5112 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5113 vector elements (not bits) for a vector with NELT elements. */
5114 static void
5115 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5116 vec_perm_builder *sel)
5118 /* The encoding is a single stepped pattern. Any wrap-around is handled
5119 by vec_perm_indices. */
5120 sel->new_vector (nelt, 1, 3);
5121 for (unsigned int i = 0; i < 3; i++)
5122 sel->quick_push (i + offset);
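/* For example, OFFSET == 2 and NELT == 8 encode the stepped series
   { 2, 3, 4, ... }, which vec_perm_indices expands to the selector
   { 2, 3, 4, 5, 6, 7, 8, 9 }: elements 2..7 of the first input followed
   by elements 0..1 of the second.  With a zero second input this is a
   whole-vector shift down by two elements.  */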
5125 /* Checks whether the target supports whole-vector shifts for vectors of mode
5126 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5127 it supports vec_perm_const with masks for all necessary shift amounts. */
5128 static bool
5129 have_whole_vector_shift (machine_mode mode)
5131 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5132 return true;
5134 /* Variable-length vectors should be handled via the optab. */
5135 unsigned int nelt;
5136 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5137 return false;
5139 vec_perm_builder sel;
5140 vec_perm_indices indices;
5141 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5143 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5144 indices.new_vector (sel, 2, nelt);
5145 if (!can_vec_perm_const_p (mode, mode, indices, false))
5146 return false;
5148 return true;
5151 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5152 multiplication operands have differing signs and (b) we intend
5153 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5154 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5156 static bool
5157 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5158 stmt_vec_info stmt_info)
5160 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5161 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5162 return false;
5164 tree rhs1 = gimple_assign_rhs1 (assign);
5165 tree rhs2 = gimple_assign_rhs2 (assign);
5166 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5167 return false;
5169 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5170 gcc_assert (reduc_info->is_reduc_info);
5171 return !directly_supported_p (DOT_PROD_EXPR,
5172 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5173 optab_vector_mixed_sign);
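/* As an illustration: for a reduction like

     int sum = 0;
     for (int i = 0; i < n; ++i)
       sum += (signed char) a[i] * (unsigned char) b[i];

   the two multiplication operands have different signs, so this returns
   true whenever the target has no mixed-sign dot-product support
   (optab_vector_mixed_sign) for the input vector type; the emulation then
   costs extra prologue invariants in vect_model_reduction_cost below.  */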
5176 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5177 functions. Design better to avoid maintenance issues. */
5179 /* Function vect_model_reduction_cost.
5181 Models cost for a reduction operation, including the vector ops
5182 generated within the strip-mine loop in some cases, the initial
5183 definition before the loop, and the epilogue code that must be generated. */
5185 static void
5186 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5187 stmt_vec_info stmt_info, internal_fn reduc_fn,
5188 vect_reduction_type reduction_type,
5189 int ncopies, stmt_vector_for_cost *cost_vec)
5191 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5192 tree vectype;
5193 machine_mode mode;
5194 class loop *loop = NULL;
5196 if (loop_vinfo)
5197 loop = LOOP_VINFO_LOOP (loop_vinfo);
5199 /* Condition reductions generate two reductions in the loop. */
5200 if (reduction_type == COND_REDUCTION)
5201 ncopies *= 2;
5203 vectype = STMT_VINFO_VECTYPE (stmt_info);
5204 mode = TYPE_MODE (vectype);
5205 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5207 gimple_match_op op;
5208 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5209 gcc_unreachable ();
5211 bool emulated_mixed_dot_prod
5212 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5213 if (reduction_type == EXTRACT_LAST_REDUCTION)
5214 /* No extra instructions are needed in the prologue. The loop body
5215 operations are costed in vectorizable_condition. */
5216 inside_cost = 0;
5217 else if (reduction_type == FOLD_LEFT_REDUCTION)
5219 /* No extra instructions needed in the prologue. */
5220 prologue_cost = 0;
5222 if (reduc_fn != IFN_LAST)
5223 /* Count one reduction-like operation per vector. */
5224 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5225 stmt_info, 0, vect_body);
5226 else
5228 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5229 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5230 inside_cost = record_stmt_cost (cost_vec, nelements,
5231 vec_to_scalar, stmt_info, 0,
5232 vect_body);
5233 inside_cost += record_stmt_cost (cost_vec, nelements,
5234 scalar_stmt, stmt_info, 0,
5235 vect_body);
5238 else
5240 /* Add in the cost of the initial definitions. */
5241 int prologue_stmts;
5242 if (reduction_type == COND_REDUCTION)
5243 /* For cond reductions we have four vectors: initial index, step,
5244 initial result of the data reduction, initial value of the index
5245 reduction. */
5246 prologue_stmts = 4;
5247 else if (emulated_mixed_dot_prod)
5248 /* We need the initial reduction value and two invariants:
5249 one that contains the minimum signed value and one that
5250 contains half of its negative. */
5251 prologue_stmts = 3;
5252 else
5253 prologue_stmts = 1;
5254 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5255 scalar_to_vec, stmt_info, 0,
5256 vect_prologue);
5259 /* Determine cost of epilogue code.
5261 We have a reduction operator that will reduce the vector in one statement.
5262 Also requires scalar extract. */
5264 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5266 if (reduc_fn != IFN_LAST)
5268 if (reduction_type == COND_REDUCTION)
5270	          /* An EQ stmt and a COND_EXPR stmt.  */
5271 epilogue_cost += record_stmt_cost (cost_vec, 2,
5272 vector_stmt, stmt_info, 0,
5273 vect_epilogue);
5274 /* Reduction of the max index and a reduction of the found
5275 values. */
5276 epilogue_cost += record_stmt_cost (cost_vec, 2,
5277 vec_to_scalar, stmt_info, 0,
5278 vect_epilogue);
5279 /* A broadcast of the max value. */
5280 epilogue_cost += record_stmt_cost (cost_vec, 1,
5281 scalar_to_vec, stmt_info, 0,
5282 vect_epilogue);
5284 else
5286 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5287 stmt_info, 0, vect_epilogue);
5288 epilogue_cost += record_stmt_cost (cost_vec, 1,
5289 vec_to_scalar, stmt_info, 0,
5290 vect_epilogue);
5293 else if (reduction_type == COND_REDUCTION)
5295 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5296 /* Extraction of scalar elements. */
5297 epilogue_cost += record_stmt_cost (cost_vec,
5298 2 * estimated_nunits,
5299 vec_to_scalar, stmt_info, 0,
5300 vect_epilogue);
5301 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5302 epilogue_cost += record_stmt_cost (cost_vec,
5303 2 * estimated_nunits - 3,
5304 scalar_stmt, stmt_info, 0,
5305 vect_epilogue);
5307 else if (reduction_type == EXTRACT_LAST_REDUCTION
5308 || reduction_type == FOLD_LEFT_REDUCTION)
5309	    /* No extra instructions are needed in the epilogue.  */
5311 else
5313 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5314 tree bitsize = TYPE_SIZE (op.type);
5315 int element_bitsize = tree_to_uhwi (bitsize);
5316 int nelements = vec_size_in_bits / element_bitsize;
5318 if (op.code == COND_EXPR)
5319 op.code = MAX_EXPR;
5321 /* We have a whole vector shift available. */
5322 if (VECTOR_MODE_P (mode)
5323 && directly_supported_p (op.code, vectype)
5324 && have_whole_vector_shift (mode))
5326 /* Final reduction via vector shifts and the reduction operator.
5327 Also requires scalar extract. */
5328 epilogue_cost += record_stmt_cost (cost_vec,
5329 exact_log2 (nelements) * 2,
5330 vector_stmt, stmt_info, 0,
5331 vect_epilogue);
5332 epilogue_cost += record_stmt_cost (cost_vec, 1,
5333 vec_to_scalar, stmt_info, 0,
5334 vect_epilogue);
5336 else
5337 /* Use extracts and reduction op for final reduction. For N
5338 elements, we have N extracts and N-1 reduction ops. */
5339 epilogue_cost += record_stmt_cost (cost_vec,
5340 nelements + nelements - 1,
5341 vector_stmt, stmt_info, 0,
5342 vect_epilogue);
5346 if (dump_enabled_p ())
5347 dump_printf (MSG_NOTE,
5348 "vect_model_reduction_cost: inside_cost = %d, "
5349 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5350 prologue_cost, epilogue_cost);
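/* As a purely illustrative count (the weights come from the target's cost
   hooks): a COND_REDUCTION handled via reduc_fn records a prologue of
   4 scalar_to_vec statements (initial index, step, initial data value and
   initial index value) and an epilogue of 2 vector_stmt, 2 vec_to_scalar
   and 1 scalar_to_vec, matching the EQ/COND_EXPR pair, the two reductions
   and the broadcast described in the branches above.  */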
5353 /* SEQ is a sequence of instructions that initialize the reduction
5354 described by REDUC_INFO. Emit them in the appropriate place. */
5356 static void
5357 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5358 stmt_vec_info reduc_info, gimple *seq)
5360 if (reduc_info->reused_accumulator)
5362 /* When reusing an accumulator from the main loop, we only need
5363 initialization instructions if the main loop can be skipped.
5364 In that case, emit the initialization instructions at the end
5365 of the guard block that does the skip. */
5366 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5367 gcc_assert (skip_edge);
5368 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5369 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5371 else
5373 /* The normal case: emit the initialization instructions on the
5374 preheader edge. */
5375 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5376 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5380 /* Function get_initial_def_for_reduction
5382 Input:
5383 REDUC_INFO - the info_for_reduction
5384 INIT_VAL - the initial value of the reduction variable
5385 NEUTRAL_OP - a value that has no effect on the reduction, as per
5386 neutral_op_for_reduction
5388 Output:
5389 Return a vector variable, initialized according to the operation that
5390 STMT_VINFO performs. This vector will be used as the initial value
5391 of the vector of partial results.
5393 The value we need is a vector in which element 0 has value INIT_VAL
5394 and every other element has value NEUTRAL_OP. */
5396 static tree
5397 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5398 stmt_vec_info reduc_info,
5399 tree init_val, tree neutral_op)
5401 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5402 tree scalar_type = TREE_TYPE (init_val);
5403 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5404 tree init_def;
5405 gimple_seq stmts = NULL;
5407 gcc_assert (vectype);
5409 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5410 || SCALAR_FLOAT_TYPE_P (scalar_type));
5412 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5413 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5415 if (operand_equal_p (init_val, neutral_op))
5417 /* If both elements are equal then the vector described above is
5418 just a splat. */
5419 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5420 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5422 else
5424 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5425 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5426 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5428 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5429 element 0. */
5430 init_def = gimple_build_vector_from_val (&stmts, vectype,
5431 neutral_op);
5432 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5433 vectype, init_def, init_val);
5435 else
5437 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5438 tree_vector_builder elts (vectype, 1, 2);
5439 elts.quick_push (init_val);
5440 elts.quick_push (neutral_op);
5441 init_def = gimple_build_vector (&stmts, &elts);
5445 if (stmts)
5446 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5447 return init_def;
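/* For example, a PLUS reduction of int with INIT_VAL s0 and NEUTRAL_OP 0
   on V4SI yields { s0, 0, 0, 0 }, whereas a MIN or MAX reduction has no
   separate neutral value (NEUTRAL_OP is INIT_VAL itself) and therefore
   yields the splat { s0, s0, s0, s0 }.  For variable-length vectors the
   same layout is obtained by splatting NEUTRAL_OP and shifting INIT_VAL
   into element 0 with VEC_SHL_INSERT, as above.  */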
5450 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5451 which performs a reduction involving GROUP_SIZE scalar statements.
5452 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5453 is nonnull, introducing extra elements of that value will not change the
5454 result. */
5456 static void
5457 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5458 stmt_vec_info reduc_info,
5459 vec<tree> *vec_oprnds,
5460 unsigned int number_of_vectors,
5461 unsigned int group_size, tree neutral_op)
5463 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5464 unsigned HOST_WIDE_INT nunits;
5465 unsigned j, number_of_places_left_in_vector;
5466 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5467 unsigned int i;
5469 gcc_assert (group_size == initial_values.length () || neutral_op);
5471 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5472 created vectors. It is greater than 1 if unrolling is performed.
5474 For example, we have two scalar operands, s1 and s2 (e.g., group of
5475 strided accesses of size two), while NUNITS is four (i.e., four scalars
5476 of this type can be packed in a vector). The output vector will contain
5477 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5478 will be 2).
5480 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5481 vectors containing the operands.
5483 For example, NUNITS is four as before, and the group size is 8
5484 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5485 {s5, s6, s7, s8}. */
5487 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5488 nunits = group_size;
5490 number_of_places_left_in_vector = nunits;
5491 bool constant_p = true;
5492 tree_vector_builder elts (vector_type, nunits, 1);
5493 elts.quick_grow (nunits);
5494 gimple_seq ctor_seq = NULL;
5495 for (j = 0; j < nunits * number_of_vectors; ++j)
5497 tree op;
5498 i = j % group_size;
5500	      /* Get the def before the loop.  In a reduction chain we have only
5501	         one initial value.  Otherwise we have as many initial values as PHIs in the group.  */
5502 if (i >= initial_values.length () || (j > i && neutral_op))
5503 op = neutral_op;
5504 else
5505 op = initial_values[i];
5507 /* Create 'vect_ = {op0,op1,...,opn}'. */
5508 number_of_places_left_in_vector--;
5509 elts[nunits - number_of_places_left_in_vector - 1] = op;
5510 if (!CONSTANT_CLASS_P (op))
5511 constant_p = false;
5513 if (number_of_places_left_in_vector == 0)
5515 tree init;
5516 if (constant_p && !neutral_op
5517 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5518 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5519 /* Build the vector directly from ELTS. */
5520 init = gimple_build_vector (&ctor_seq, &elts);
5521 else if (neutral_op)
5523 /* Build a vector of the neutral value and shift the
5524 other elements into place. */
5525 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5526 neutral_op);
5527 int k = nunits;
5528 while (k > 0 && elts[k - 1] == neutral_op)
5529 k -= 1;
5530 while (k > 0)
5532 k -= 1;
5533 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5534 vector_type, init, elts[k]);
5537 else
5539 /* First time round, duplicate ELTS to fill the
5540 required number of vectors. */
5541 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5542 elts, number_of_vectors, *vec_oprnds);
5543 break;
5545 vec_oprnds->quick_push (init);
5547 number_of_places_left_in_vector = nunits;
5548 elts.new_vector (vector_type, nunits, 1);
5549 elts.quick_grow (nunits);
5550 constant_p = true;
5553 if (ctor_seq != NULL)
5554 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
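/* As an illustration of the variable-length path above: for an SLP
   reduction with GROUP_SIZE == 2, initial values { a, b } and neutral
   value 0, NUNITS is taken to be the group size, so ELTS is { a, b }.
   The code splats 0 and then applies two VEC_SHL_INSERTs, first with b
   and then with a, producing { a, b, 0, 0, ... }; the trailing zeros are
   harmless because extra NEUTRAL_OP elements do not change the result.  */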
5557 /* For a statement STMT_INFO taking part in a reduction operation return
5558 the stmt_vec_info the meta information is stored on. */
5560 stmt_vec_info
5561 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5563 stmt_info = vect_orig_stmt (stmt_info);
5564 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5565 if (!is_a <gphi *> (stmt_info->stmt)
5566 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5567 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5568 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5569 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5571 if (gimple_phi_num_args (phi) == 1)
5572 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5574 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5576 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5577 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5578 stmt_info = info;
5580 return stmt_info;
5583 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5584 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5585 return false. */
5587 static bool
5588 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5589 stmt_vec_info reduc_info)
5591 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5592 if (!main_loop_vinfo)
5593 return false;
5595 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5596 return false;
5598 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5599 auto_vec<tree, 16> main_loop_results (num_phis);
5600 auto_vec<tree, 16> initial_values (num_phis);
5601 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5603 /* The epilogue loop can be entered either from the main loop or
5604 from an earlier guard block. */
5605 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5606 for (tree incoming_value : reduc_info->reduc_initial_values)
5608 /* Look for:
5610 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5611 INITIAL_VALUE(guard block)>. */
5612 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5614 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5615 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5617 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5618 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5620 main_loop_results.quick_push (from_main_loop);
5621 initial_values.quick_push (from_skip);
5624 else
5625 /* The main loop dominates the epilogue loop. */
5626 main_loop_results.splice (reduc_info->reduc_initial_values);
5628 /* See if the main loop has the kind of accumulator we need. */
5629 vect_reusable_accumulator *accumulator
5630 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5631 if (!accumulator
5632 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5633 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5634 accumulator->reduc_info->reduc_scalar_results.begin ()))
5635 return false;
5637 /* Handle the case where we can reduce wider vectors to narrower ones. */
5638 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5639 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5640 unsigned HOST_WIDE_INT m;
5641 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5642 TYPE_VECTOR_SUBPARTS (vectype), &m))
5643 return false;
5644	  /* Check that the intermediate vector types and operations are available.  */
5645 tree prev_vectype = old_vectype;
5646 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5647 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5649 intermediate_nunits = exact_div (intermediate_nunits, 2);
5650 tree intermediate_vectype = get_related_vectype_for_scalar_type
5651 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5652 if (!intermediate_vectype
5653 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5654 intermediate_vectype)
5655 || !can_vec_extract (TYPE_MODE (prev_vectype),
5656 TYPE_MODE (intermediate_vectype)))
5657 return false;
5658 prev_vectype = intermediate_vectype;
5661 /* Non-SLP reductions might apply an adjustment after the reduction
5662 operation, in order to simplify the initialization of the accumulator.
5663 If the epilogue loop carries on from where the main loop left off,
5664 it should apply the same adjustment to the final reduction result.
5666 If the epilogue loop can also be entered directly (rather than via
5667 the main loop), we need to be able to handle that case in the same way,
5668 with the same adjustment. (In principle we could add a PHI node
5669 to select the correct adjustment, but in practice that shouldn't be
5670 necessary.) */
5671 tree main_adjustment
5672 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5673 if (loop_vinfo->main_loop_edge && main_adjustment)
5675 gcc_assert (num_phis == 1);
5676 tree initial_value = initial_values[0];
5677 /* Check that we can use INITIAL_VALUE as the adjustment and
5678 initialize the accumulator with a neutral value instead. */
5679 if (!operand_equal_p (initial_value, main_adjustment))
5680 return false;
5681 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5682 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5683 code, initial_value);
5685 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5686 reduc_info->reduc_initial_values.truncate (0);
5687 reduc_info->reduc_initial_values.splice (initial_values);
5688 reduc_info->reused_accumulator = accumulator;
5689 return true;
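/* As an illustration: if the main loop accumulated into a V8SI vector and
   the epilogue loop uses V4SI, then M == 2 and the single intermediate
   step above checks that the reduction code is supported on V4SI and that
   a V4SI half can be extracted from a V8SI.  The wider accumulator can
   then be folded down to V4SI (see vect_create_partial_epilog below)
   before the epilogue loop continues the reduction.  */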
5692 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5693 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5695 static tree
5696 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5697 gimple_seq *seq)
5699 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5700 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5701 tree stype = TREE_TYPE (vectype);
5702 tree new_temp = vec_def;
5703 while (nunits > nunits1)
5705 nunits /= 2;
5706 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5707 stype, nunits);
5708 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5710 /* The target has to make sure we support lowpart/highpart
5711 extraction, either via direct vector extract or through
5712 an integer mode punning. */
5713 tree dst1, dst2;
5714 gimple *epilog_stmt;
5715 if (convert_optab_handler (vec_extract_optab,
5716 TYPE_MODE (TREE_TYPE (new_temp)),
5717 TYPE_MODE (vectype1))
5718 != CODE_FOR_nothing)
5720 /* Extract sub-vectors directly once vec_extract becomes
5721 a conversion optab. */
5722 dst1 = make_ssa_name (vectype1);
5723 epilog_stmt
5724 = gimple_build_assign (dst1, BIT_FIELD_REF,
5725 build3 (BIT_FIELD_REF, vectype1,
5726 new_temp, TYPE_SIZE (vectype1),
5727 bitsize_int (0)));
5728 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5729 dst2 = make_ssa_name (vectype1);
5730 epilog_stmt
5731 = gimple_build_assign (dst2, BIT_FIELD_REF,
5732 build3 (BIT_FIELD_REF, vectype1,
5733 new_temp, TYPE_SIZE (vectype1),
5734 bitsize_int (bitsize)));
5735 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5737 else
5739 /* Extract via punning to appropriately sized integer mode
5740 vector. */
5741 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5742 tree etype = build_vector_type (eltype, 2);
5743 gcc_assert (convert_optab_handler (vec_extract_optab,
5744 TYPE_MODE (etype),
5745 TYPE_MODE (eltype))
5746 != CODE_FOR_nothing);
5747 tree tem = make_ssa_name (etype);
5748 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5749 build1 (VIEW_CONVERT_EXPR,
5750 etype, new_temp));
5751 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5752 new_temp = tem;
5753 tem = make_ssa_name (eltype);
5754 epilog_stmt
5755 = gimple_build_assign (tem, BIT_FIELD_REF,
5756 build3 (BIT_FIELD_REF, eltype,
5757 new_temp, TYPE_SIZE (eltype),
5758 bitsize_int (0)));
5759 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5760 dst1 = make_ssa_name (vectype1);
5761 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5762 build1 (VIEW_CONVERT_EXPR,
5763 vectype1, tem));
5764 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5765 tem = make_ssa_name (eltype);
5766 epilog_stmt
5767 = gimple_build_assign (tem, BIT_FIELD_REF,
5768 build3 (BIT_FIELD_REF, eltype,
5769 new_temp, TYPE_SIZE (eltype),
5770 bitsize_int (bitsize)));
5771 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5772 dst2 = make_ssa_name (vectype1);
5773 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5774 build1 (VIEW_CONVERT_EXPR,
5775 vectype1, tem));
5776 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5779 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5782 return new_temp;
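/* As an illustration: reducing a V8SI VEC_DEF to V4SI with a PLUS
   reduction emits roughly

     lo  = BIT_FIELD_REF <vec_def, 128, 0>;
     hi  = BIT_FIELD_REF <vec_def, 128, 128>;
     res = lo + hi;

   when vec_extract supports the (V8SI, V4SI) pair, and otherwise performs
   the same extraction after punning VEC_DEF to a two-element vector of
   128-bit integers.  The halving repeats until VECTYPE is reached.  */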
5785 /* Function vect_create_epilog_for_reduction
5787 Create code at the loop-epilog to finalize the result of a reduction
5788 computation.
5790 STMT_INFO is the scalar reduction stmt that is being vectorized.
5791 SLP_NODE is an SLP node containing a group of reduction statements. The
5792 first one in this group is STMT_INFO.
5793 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5794 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5795 (counting from 0)
5797 This function:
5798 1. Completes the reduction def-use cycles.
5799 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5800 by calling the function specified by REDUC_FN if available, or by
5801 other means (whole-vector shifts or a scalar loop).
5802 The function also creates a new phi node at the loop exit to preserve
5803 loop-closed form, as illustrated below.
5805 The flow at the entry to this function:
5807 loop:
5808 vec_def = phi <vec_init, null> # REDUCTION_PHI
5809 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5810 s_loop = scalar_stmt # (scalar) STMT_INFO
5811 loop_exit:
5812 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5813 use <s_out0>
5814 use <s_out0>
5816 The above is transformed by this function into:
5818 loop:
5819 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5820 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5821 s_loop = scalar_stmt # (scalar) STMT_INFO
5822 loop_exit:
5823 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5824 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5825 v_out2 = reduce <v_out1>
5826 s_out3 = extract_field <v_out2, 0>
5827 s_out4 = adjust_result <s_out3>
5828 use <s_out4>
5829 use <s_out4>
5832 static void
5833 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5834 stmt_vec_info stmt_info,
5835 slp_tree slp_node,
5836 slp_instance slp_node_instance)
5838 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5839 gcc_assert (reduc_info->is_reduc_info);
5840 /* For double reductions we need to get at the inner loop reduction
5841 stmt which has the meta info attached. Our stmt_info is that of the
5842 loop-closed PHI of the inner loop which we remember as
5843 def for the reduction PHI generation. */
5844 bool double_reduc = false;
5845 stmt_vec_info rdef_info = stmt_info;
5846 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5848 gcc_assert (!slp_node);
5849 double_reduc = true;
5850 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5851 (stmt_info->stmt, 0));
5852 stmt_info = vect_stmt_to_vectorize (stmt_info);
5854 gphi *reduc_def_stmt
5855 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5856 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5857 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5858 tree vectype;
5859 machine_mode mode;
5860 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5861 basic_block exit_bb;
5862 tree scalar_dest;
5863 tree scalar_type;
5864 gimple *new_phi = NULL, *phi = NULL;
5865 gimple_stmt_iterator exit_gsi;
5866 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5867 gimple *epilog_stmt = NULL;
5868 gimple *exit_phi;
5869 tree bitsize;
5870 tree def;
5871 tree orig_name, scalar_result;
5872 imm_use_iterator imm_iter, phi_imm_iter;
5873 use_operand_p use_p, phi_use_p;
5874 gimple *use_stmt;
5875 auto_vec<tree> reduc_inputs;
5876 int j, i;
5877 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5878 unsigned int group_size = 1, k;
5879 auto_vec<gimple *> phis;
5880 /* SLP reduction without reduction chain, e.g.,
5881 # a1 = phi <a2, a0>
5882 # b1 = phi <b2, b0>
5883 a2 = operation (a1)
5884 b2 = operation (b1) */
5885 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5886 bool direct_slp_reduc;
5887 tree induction_index = NULL_TREE;
5889 if (slp_node)
5890 group_size = SLP_TREE_LANES (slp_node);
5892 if (nested_in_vect_loop_p (loop, stmt_info))
5894 outer_loop = loop;
5895 loop = loop->inner;
5896 gcc_assert (!slp_node && double_reduc);
5899 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5900 gcc_assert (vectype);
5901 mode = TYPE_MODE (vectype);
5903 tree induc_val = NULL_TREE;
5904 tree adjustment_def = NULL;
5905 if (slp_node)
5907 else
5909 /* Optimize: for induction condition reduction, if we can't use zero
5910 for induc_val, use initial_def. */
5911 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5912 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5913 else if (double_reduc)
5915 else
5916 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5919 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5920 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5921 if (slp_reduc)
5922 /* All statements produce live-out values. */
5923 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5924 else if (slp_node)
5926 /* The last statement in the reduction chain produces the live-out
5927 value. Note SLP optimization can shuffle scalar stmts to
5928 optimize permutations so we have to search for the last stmt. */
5929 for (k = 0; k < group_size; ++k)
5930 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5932 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5933 break;
5937 unsigned vec_num;
5938 int ncopies;
5939 if (slp_node)
5941 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5942 ncopies = 1;
5944 else
5946 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5947 vec_num = 1;
5948 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5951 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5952 which is updated with the current index of the loop for every match of
5953 the original loop's cond_expr (VEC_STMT). This results in a vector
5954 containing the last time the condition passed for that vector lane.
5955 The first match will be a 1 to allow 0 to be used for non-matching
5956 indexes. If there are no matches at all then the vector will be all
5957 zeroes.
5959 PR92772: This algorithm is broken for architectures that support
5960 masked vectors, but do not provide fold_extract_last. */
5961 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5963 auto_vec<std::pair<tree, bool>, 2> ccompares;
5964 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5965 cond_info = vect_stmt_to_vectorize (cond_info);
5966 while (cond_info != reduc_info)
5968 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5970 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5971 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5972 ccompares.safe_push
5973 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5974 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5976 cond_info
5977 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5978 1 + STMT_VINFO_REDUC_IDX
5979 (cond_info)));
5980 cond_info = vect_stmt_to_vectorize (cond_info);
5982 gcc_assert (ccompares.length () != 0);
5984 tree indx_before_incr, indx_after_incr;
5985 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5986 int scalar_precision
5987 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5988 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5989 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5990 (TYPE_MODE (vectype), cr_index_scalar_type,
5991 TYPE_VECTOR_SUBPARTS (vectype));
5993 /* First we create a simple vector induction variable which starts
5994 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5995 vector size (STEP). */
5997 /* Create a {1,2,3,...} vector. */
5998 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6000 /* Create a vector of the step value. */
6001 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6002 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6004 /* Create an induction variable. */
6005 gimple_stmt_iterator incr_gsi;
6006 bool insert_after;
6007 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6008 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6009 insert_after, &indx_before_incr, &indx_after_incr);
6011 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6012 filled with zeros (VEC_ZERO). */
6014 /* Create a vector of 0s. */
6015 tree zero = build_zero_cst (cr_index_scalar_type);
6016 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6018 /* Create a vector phi node. */
6019 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6020 new_phi = create_phi_node (new_phi_tree, loop->header);
6021 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6022 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6024	      /* Now take the condition from the loop's original cond_exprs
6025	         and produce a new cond_expr (INDEX_COND_EXPR) which for
6026	         every match uses values from the induction variable
6027	         (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
6028 (NEW_PHI_TREE).
6029 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6030 the new cond_expr (INDEX_COND_EXPR). */
6031 gimple_seq stmts = NULL;
6032 for (int i = ccompares.length () - 1; i != -1; --i)
6034 tree ccompare = ccompares[i].first;
6035 if (ccompares[i].second)
6036 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6037 cr_index_vector_type,
6038 ccompare,
6039 indx_before_incr, new_phi_tree);
6040 else
6041 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6042 cr_index_vector_type,
6043 ccompare,
6044 new_phi_tree, indx_before_incr);
6046 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6048 /* Update the phi with the vec cond. */
6049 induction_index = new_phi_tree;
6050 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6051 loop_latch_edge (loop), UNKNOWN_LOCATION);
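      /* As an illustration: with V4SI and two vector iterations, the IV
	 above takes the values { 1, 2, 3, 4 } and then { 5, 6, 7, 8 }.
	 If only lane 1 matches, and only in the first iteration, the phi
	 ends up as { 0, 2, 0, 0 }: the value 0 is reserved for lanes that
	 never matched, and the maximum over the final vector identifies
	 the last iteration in which any lane's condition held.  */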
6054 /* 2. Create epilog code.
6055 The reduction epilog code operates across the elements of the vector
6056 of partial results computed by the vectorized loop.
6057 The reduction epilog code consists of:
6059 step 1: compute the scalar result in a vector (v_out2)
6060 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6061 step 3: adjust the scalar result (s_out3) if needed.
6063	     Step 1 can be accomplished using one of the following three schemes:
6064 (scheme 1) using reduc_fn, if available.
6065 (scheme 2) using whole-vector shifts, if available.
6066 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6067 combined.
6069 The overall epilog code looks like this:
6071 s_out0 = phi <s_loop> # original EXIT_PHI
6072 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6073 v_out2 = reduce <v_out1> # step 1
6074 s_out3 = extract_field <v_out2, 0> # step 2
6075 s_out4 = adjust_result <s_out3> # step 3
6077 (step 3 is optional, and steps 1 and 2 may be combined).
6078 Lastly, the uses of s_out0 are replaced by s_out4. */
6081 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6082 v_out1 = phi <VECT_DEF>
6083 Store them in NEW_PHIS. */
6084 if (double_reduc)
6085 loop = outer_loop;
6086 exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
6087 exit_gsi = gsi_after_labels (exit_bb);
6088 reduc_inputs.create (slp_node ? vec_num : ncopies);
6089 for (unsigned i = 0; i < vec_num; i++)
6091 gimple_seq stmts = NULL;
6092 if (slp_node)
6093 def = vect_get_slp_vect_def (slp_node, i);
6094 else
6095 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6096 for (j = 0; j < ncopies; j++)
6098 tree new_def = copy_ssa_name (def);
6099 phi = create_phi_node (new_def, exit_bb);
6100 if (j)
6101 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6102 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
6103 new_def = gimple_convert (&stmts, vectype, new_def);
6104 reduc_inputs.quick_push (new_def);
6106 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6109 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6110 (i.e. when reduc_fn is not available) and in the final adjustment
6111 code (if needed). Also get the original scalar reduction variable as
6112 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6113 represents a reduction pattern), the tree-code and scalar-def are
6114 taken from the original stmt that the pattern-stmt (STMT) replaces.
6115 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6116 are taken from STMT. */
6118 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6119 if (orig_stmt_info != stmt_info)
6121 /* Reduction pattern */
6122 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6123 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6126 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6127 scalar_type = TREE_TYPE (scalar_dest);
6128 scalar_results.truncate (0);
6129 scalar_results.reserve_exact (group_size);
6130 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6131 bitsize = TYPE_SIZE (scalar_type);
6133 /* True if we should implement SLP_REDUC using native reduction operations
6134 instead of scalar operations. */
6135 direct_slp_reduc = (reduc_fn != IFN_LAST
6136 && slp_reduc
6137 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6139 /* In case of reduction chain, e.g.,
6140 # a1 = phi <a3, a0>
6141 a2 = operation (a1)
6142 a3 = operation (a2),
6144 we may end up with more than one vector result. Here we reduce them
6145 to one vector.
6147	     The same is true for an SLP reduction, e.g.,
6148 # a1 = phi <a2, a0>
6149 # b1 = phi <b2, b0>
6150 a2 = operation (a1)
6151	       b2 = operation (b1),
6153 where we can end up with more than one vector as well. We can
6154 easily accumulate vectors when the number of vector elements is
6155 a multiple of the SLP group size.
6157 The same is true if we couldn't use a single defuse cycle. */
6158 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6159 || direct_slp_reduc
6160 || (slp_reduc
6161 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6162 || ncopies > 1)
6164 gimple_seq stmts = NULL;
6165 tree single_input = reduc_inputs[0];
6166 for (k = 1; k < reduc_inputs.length (); k++)
6167 single_input = gimple_build (&stmts, code, vectype,
6168 single_input, reduc_inputs[k]);
6169 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6171 reduc_inputs.truncate (0);
6172 reduc_inputs.safe_push (single_input);
6175 tree orig_reduc_input = reduc_inputs[0];
6177 /* If this loop is an epilogue loop that can be skipped after the
6178 main loop, we can only share a reduction operation between the
6179 main loop and the epilogue if we put it at the target of the
6180 skip edge.
6182 We can still reuse accumulators if this check fails. Doing so has
6183 the minor(?) benefit of making the epilogue loop's scalar result
6184 independent of the main loop's scalar result. */
6185 bool unify_with_main_loop_p = false;
6186 if (reduc_info->reused_accumulator
6187 && loop_vinfo->skip_this_loop_edge
6188 && single_succ_p (exit_bb)
6189 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6191 unify_with_main_loop_p = true;
6193 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6194 reduc_inputs[0] = make_ssa_name (vectype);
6195 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6196 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6197 UNKNOWN_LOCATION);
6198 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6199 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6200 exit_gsi = gsi_after_labels (reduc_block);
6203 /* Shouldn't be used beyond this point. */
6204 exit_bb = nullptr;
6206 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6207 && reduc_fn != IFN_LAST)
6209 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6210 various data values where the condition matched and another vector
6211 (INDUCTION_INDEX) containing all the indexes of those matches. We
6212 need to extract the last matching index (which will be the index with
6213 highest value) and use this to index into the data vector.
6214 For the case where there were no matches, the data vector will contain
6215 all default values and the index vector will be all zeros. */
6217 /* Get various versions of the type of the vector of indexes. */
6218 tree index_vec_type = TREE_TYPE (induction_index);
6219 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6220 tree index_scalar_type = TREE_TYPE (index_vec_type);
6221 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6223 /* Get an unsigned integer version of the type of the data vector. */
6224 int scalar_precision
6225 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6226 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6227 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6228 vectype);
6230 /* First we need to create a vector (ZERO_VEC) of zeros and another
6231 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6232 can create using a MAX reduction and then expanding.
6233 In the case where the loop never made any matches, the max index will
6234 be zero. */
6236 /* Vector of {0, 0, 0,...}. */
6237 tree zero_vec = build_zero_cst (vectype);
6239 /* Find maximum value from the vector of found indexes. */
6240 tree max_index = make_ssa_name (index_scalar_type);
6241 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6242 1, induction_index);
6243 gimple_call_set_lhs (max_index_stmt, max_index);
6244 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6246 /* Vector of {max_index, max_index, max_index,...}. */
6247 tree max_index_vec = make_ssa_name (index_vec_type);
6248 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6249 max_index);
6250 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6251 max_index_vec_rhs);
6252 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6254 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6255 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6256 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6257 otherwise. Only one value should match, resulting in a vector
6258 (VEC_COND) with one data value and the rest zeros.
6259 In the case where the loop never made any matches, every index will
6260 match, resulting in a vector with all data values (which will all be
6261 the default value). */
6263 /* Compare the max index vector to the vector of found indexes to find
6264 the position of the max value. */
6265 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6266 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6267 induction_index,
6268 max_index_vec);
6269 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6271 /* Use the compare to choose either values from the data vector or
6272 zero. */
6273 tree vec_cond = make_ssa_name (vectype);
6274 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6275 vec_compare,
6276 reduc_inputs[0],
6277 zero_vec);
6278 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6280 /* Finally we need to extract the data value from the vector (VEC_COND)
6281	         into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
6282 reduction, but because this doesn't exist, we can use a MAX reduction
6283 instead. The data value might be signed or a float so we need to cast
6284 it first.
6285 In the case where the loop never made any matches, the data values are
6286 all identical, and so will reduce down correctly. */
6288 /* Make the matched data values unsigned. */
6289 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6290 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6291 vec_cond);
6292 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6293 VIEW_CONVERT_EXPR,
6294 vec_cond_cast_rhs);
6295 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6297 /* Reduce down to a scalar value. */
6298 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6299 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6300 1, vec_cond_cast);
6301 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6302 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6304 /* Convert the reduced value back to the result type and set as the
6305 result. */
6306 gimple_seq stmts = NULL;
6307 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6308 data_reduc);
6309 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6310 scalar_results.safe_push (new_temp);
6312 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6313 && reduc_fn == IFN_LAST)
6315 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6316 idx = 0;
6317 idx_val = induction_index[0];
6318 val = data_reduc[0];
6319 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6320 if (induction_index[i] > idx_val)
6321 val = data_reduc[i], idx_val = induction_index[i];
6322 return val; */
6324 tree data_eltype = TREE_TYPE (vectype);
6325 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6326 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6327 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6328 /* Enforced by vectorizable_reduction, which ensures we have target
6329 support before allowing a conditional reduction on variable-length
6330 vectors. */
6331 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6332 tree idx_val = NULL_TREE, val = NULL_TREE;
6333 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6335 tree old_idx_val = idx_val;
6336 tree old_val = val;
6337 idx_val = make_ssa_name (idx_eltype);
6338 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6339 build3 (BIT_FIELD_REF, idx_eltype,
6340 induction_index,
6341 bitsize_int (el_size),
6342 bitsize_int (off)));
6343 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6344 val = make_ssa_name (data_eltype);
6345 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6346 build3 (BIT_FIELD_REF,
6347 data_eltype,
6348 reduc_inputs[0],
6349 bitsize_int (el_size),
6350 bitsize_int (off)));
6351 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6352 if (off != 0)
6354 tree new_idx_val = idx_val;
6355 if (off != v_size - el_size)
6357 new_idx_val = make_ssa_name (idx_eltype);
6358 epilog_stmt = gimple_build_assign (new_idx_val,
6359 MAX_EXPR, idx_val,
6360 old_idx_val);
6361 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6363 tree cond = make_ssa_name (boolean_type_node);
6364 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6365 idx_val, old_idx_val);
6366 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6367 tree new_val = make_ssa_name (data_eltype);
6368 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6369 cond, val, old_val);
6370 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6371 idx_val = new_idx_val;
6372 val = new_val;
6375 /* Convert the reduced value back to the result type and set as the
6376 result. */
6377 gimple_seq stmts = NULL;
6378 val = gimple_convert (&stmts, scalar_type, val);
6379 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6380 scalar_results.safe_push (val);
6383 /* 2.3 Create the reduction code, using one of the three schemes described
6384 above. In SLP we simply need to extract all the elements from the
6385 vector (without reducing them), so we use scalar shifts. */
6386 else if (reduc_fn != IFN_LAST && !slp_reduc)
6388 tree tmp;
6389 tree vec_elem_type;
6391 /* Case 1: Create:
6392 v_out2 = reduc_expr <v_out1> */
6394 if (dump_enabled_p ())
6395 dump_printf_loc (MSG_NOTE, vect_location,
6396 "Reduce using direct vector reduction.\n");
6398 gimple_seq stmts = NULL;
6399 vec_elem_type = TREE_TYPE (vectype);
6400 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6401 vec_elem_type, reduc_inputs[0]);
6402 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6403 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6405 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6406 && induc_val)
6408	          /* Earlier we set the initial value to be a vector of induc_val
6409 values. Check the result and if it is induc_val then replace
6410 with the original initial value, unless induc_val is
6411 the same as initial_def already. */
6412 tree zcompare = make_ssa_name (boolean_type_node);
6413 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6414 new_temp, induc_val);
6415 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6416 tree initial_def = reduc_info->reduc_initial_values[0];
6417 tmp = make_ssa_name (new_scalar_dest);
6418 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6419 initial_def, new_temp);
6420 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6421 new_temp = tmp;
6424 scalar_results.safe_push (new_temp);
6426 else if (direct_slp_reduc)
6428 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6429 with the elements for other SLP statements replaced with the
6430 neutral value. We can then do a normal reduction on each vector. */
6432 /* Enforced by vectorizable_reduction. */
6433 gcc_assert (reduc_inputs.length () == 1);
6434 gcc_assert (pow2p_hwi (group_size));
6436 gimple_seq seq = NULL;
6438 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6439 and the same element size as VECTYPE. */
6440 tree index = build_index_vector (vectype, 0, 1);
6441 tree index_type = TREE_TYPE (index);
6442 tree index_elt_type = TREE_TYPE (index_type);
6443 tree mask_type = truth_type_for (index_type);
6445 /* Create a vector that, for each element, identifies which of
6446 the REDUC_GROUP_SIZE results should use it. */
6447 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6448 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6449 build_vector_from_val (index_type, index_mask));
6451 /* Get a neutral vector value. This is simply a splat of the neutral
6452 scalar value if we have one, otherwise the initial scalar value
6453 is itself a neutral value. */
6454 tree vector_identity = NULL_TREE;
6455 tree neutral_op = NULL_TREE;
6456 if (slp_node)
6458 tree initial_value = NULL_TREE;
6459 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6460 initial_value = reduc_info->reduc_initial_values[0];
6461 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6462 initial_value);
6464 if (neutral_op)
6465 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6466 neutral_op);
6467 for (unsigned int i = 0; i < group_size; ++i)
6469	          /* If there's no universal neutral value, we can use the
6470 initial scalar value from the original PHI. This is used
6471 for MIN and MAX reduction, for example. */
6472 if (!neutral_op)
6474 tree scalar_value = reduc_info->reduc_initial_values[i];
6475 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6476 scalar_value);
6477 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6478 scalar_value);
6481 /* Calculate the equivalent of:
6483 sel[j] = (index[j] == i);
6485 which selects the elements of REDUC_INPUTS[0] that should
6486 be included in the result. */
6487 tree compare_val = build_int_cst (index_elt_type, i);
6488 compare_val = build_vector_from_val (index_type, compare_val);
6489 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6490 index, compare_val);
6492 /* Calculate the equivalent of:
6494	               vec = sel ? reduc_inputs[0] : vector_identity;
6496 VEC is now suitable for a full vector reduction. */
6497 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6498 sel, reduc_inputs[0], vector_identity);
6500 /* Do the reduction and convert it to the appropriate type. */
6501 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6502 TREE_TYPE (vectype), vec);
6503 scalar = gimple_convert (&seq, scalar_type, scalar);
6504 scalar_results.safe_push (scalar);
6506 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6508 else
6510 bool reduce_with_shift;
6511 tree vec_temp;
6513 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6515 /* See if the target wants to do the final (shift) reduction
6516 in a vector mode of smaller size and first reduce upper/lower
6517 halves against each other. */
6518 enum machine_mode mode1 = mode;
6519 tree stype = TREE_TYPE (vectype);
6520 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6521 unsigned nunits1 = nunits;
6522 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6523 && reduc_inputs.length () == 1)
6525 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6526	          /* For SLP reductions we have to make sure lanes match up, but
6527	             since we're doing an individual-element final reduction, reducing
6528	             the vector width here is even more important.
6529	             ??? We can also separate lanes with permutes; for the common
6530	             case of a power-of-two group size, odd/even extracts would work.  */
6531 if (slp_reduc && nunits != nunits1)
6533 nunits1 = least_common_multiple (nunits1, group_size);
6534 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6537 if (!slp_reduc
6538 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6539 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6541 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6542 stype, nunits1);
6543 reduce_with_shift = have_whole_vector_shift (mode1);
6544 if (!VECTOR_MODE_P (mode1)
6545 || !directly_supported_p (code, vectype1))
6546 reduce_with_shift = false;
6548	      /* First reduce the vector to the desired size for the shift
6549	         reduction by combining the upper and lower halves.  */
6550 gimple_seq stmts = NULL;
6551 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6552 code, &stmts);
6553 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6554 reduc_inputs[0] = new_temp;
6556 if (reduce_with_shift && !slp_reduc)
6558 int element_bitsize = tree_to_uhwi (bitsize);
6559 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6560 for variable-length vectors and also requires direct target support
6561 for loop reductions. */
6562 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6563 int nelements = vec_size_in_bits / element_bitsize;
6564 vec_perm_builder sel;
6565 vec_perm_indices indices;
6567 int elt_offset;
6569 tree zero_vec = build_zero_cst (vectype1);
6570 /* Case 2: Create:
6571 for (offset = nelements/2; offset >= 1; offset/=2)
6573 Create: va' = vec_shift <va, offset>
6574 Create: va = vop <va, va'>
6575 } */
6577 tree rhs;
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_NOTE, vect_location,
6581 "Reduce using vector shifts\n");
6583 gimple_seq stmts = NULL;
6584 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6585 for (elt_offset = nelements / 2;
6586 elt_offset >= 1;
6587 elt_offset /= 2)
6589 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6590 indices.new_vector (sel, 2, nelements);
6591 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6592 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6593 new_temp, zero_vec, mask);
6594 new_temp = gimple_build (&stmts, code,
6595 vectype1, new_name, new_temp);
6597 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6599 /* 2.4 Extract the final scalar result. Create:
6600 s_out3 = extract_field <v_out2, bitpos> */
6602 if (dump_enabled_p ())
6603 dump_printf_loc (MSG_NOTE, vect_location,
6604 "extract scalar result\n");
6606 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6607 bitsize, bitsize_zero_node);
6608 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6609 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6610 gimple_assign_set_lhs (epilog_stmt, new_temp);
6611 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6612 scalar_results.safe_push (new_temp);
6614 else
6616 /* Case 3: Create:
6617 s = extract_field <v_out2, 0>
6618 for (offset = element_size;
6619 offset < vector_size;
6620 offset += element_size;)
6622 Create: s' = extract_field <v_out2, offset>
6623 Create: s = op <s, s'> // For non SLP cases
6624 } */
6626 if (dump_enabled_p ())
6627 dump_printf_loc (MSG_NOTE, vect_location,
6628 "Reduce using scalar code.\n");
6630 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6631 int element_bitsize = tree_to_uhwi (bitsize);
6632 tree compute_type = TREE_TYPE (vectype);
6633 gimple_seq stmts = NULL;
6634 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6636 int bit_offset;
6637 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6638 vec_temp, bitsize, bitsize_zero_node);
6640 /* In SLP we don't need to apply the reduction operation, so we just
6641 collect the s' values in SCALAR_RESULTS. */
6642 if (slp_reduc)
6643 scalar_results.safe_push (new_temp);
6645 for (bit_offset = element_bitsize;
6646 bit_offset < vec_size_in_bits;
6647 bit_offset += element_bitsize)
6649 tree bitpos = bitsize_int (bit_offset);
6650 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6651 compute_type, vec_temp,
6652 bitsize, bitpos);
6653 if (slp_reduc)
6655 /* In SLP we don't need to apply the reduction operation, so
6656 we just collect the s' values in SCALAR_RESULTS. */
6657 new_temp = new_name;
6658 scalar_results.safe_push (new_name);
6660 else
6661 new_temp = gimple_build (&stmts, code, compute_type,
6662 new_name, new_temp);
6666 /* The only case where we need to reduce scalar results in SLP is
6667 unrolling. If the size of SCALAR_RESULTS is greater than
6668 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6669 REDUC_GROUP_SIZE. */
6670 if (slp_reduc)
6672 tree res, first_res, new_res;
6674 /* Reduce multiple scalar results in case of SLP unrolling. */
6675 for (j = group_size; scalar_results.iterate (j, &res);
6676 j++)
6678 first_res = scalar_results[j % group_size];
6679 new_res = gimple_build (&stmts, code, compute_type,
6680 first_res, res);
6681 scalar_results[j % group_size] = new_res;
6683 scalar_results.truncate (group_size);
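/* Finally convert the combined values to the scalar type of the
   reduction.  */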
6684 for (k = 0; k < group_size; k++)
6685 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6686 scalar_results[k]);
6688 else
6690 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6691 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6692 scalar_results.safe_push (new_temp);
6695 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6698 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6699 && induc_val)
6701 /* Earlier we set the initial value to be a vector of induc_val
6702 values. Check the result and if it is induc_val then replace
6703 it with the original initial value, unless induc_val is already
6704 the same as initial_def. */
6705 tree zcompare = make_ssa_name (boolean_type_node);
6706 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6707 induc_val);
6708 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6709 tree initial_def = reduc_info->reduc_initial_values[0];
6710 tree tmp = make_ssa_name (new_scalar_dest);
6711 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6712 initial_def, new_temp);
6713 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6714 scalar_results[0] = tmp;
6718 /* 2.5 Adjust the final result by the initial value of the reduction
6719 variable. (When such adjustment is not needed, then
6720 'adjustment_def' is zero). For example, if code is PLUS we create:
6721 new_temp = loop_exit_def + adjustment_def */
6723 if (adjustment_def)
6725 gcc_assert (!slp_reduc);
6726 gimple_seq stmts = NULL;
6727 if (double_reduc)
6729 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6730 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6731 new_temp = gimple_build (&stmts, code, vectype,
6732 reduc_inputs[0], adjustment_def);
6734 else
6736 new_temp = scalar_results[0];
6737 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6738 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6739 adjustment_def);
6740 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6741 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6742 new_temp, adjustment_def);
6743 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6746 epilog_stmt = gimple_seq_last_stmt (stmts);
6747 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6748 scalar_results[0] = new_temp;
6751 /* Record this operation if it could be reused by the epilogue loop. */
6752 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6753 && reduc_inputs.length () == 1)
6754 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6755 { orig_reduc_input, reduc_info });
6757 if (double_reduc)
6758 loop = outer_loop;
6760 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6761 phis with new adjusted scalar results, i.e., replace use <s_out0>
6762 with use <s_out4>.
6764 Transform:
6765 loop_exit:
6766 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6767 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6768 v_out2 = reduce <v_out1>
6769 s_out3 = extract_field <v_out2, 0>
6770 s_out4 = adjust_result <s_out3>
6771 use <s_out0>
6772 use <s_out0>
6774 into:
6776 loop_exit:
6777 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6778 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6779 v_out2 = reduce <v_out1>
6780 s_out3 = extract_field <v_out2, 0>
6781 s_out4 = adjust_result <s_out3>
6782 use <s_out4>
6783 use <s_out4> */
6785 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6786 for (k = 0; k < live_out_stmts.size (); k++)
6788 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6789 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6791 phis.create (3);
6792 /* Find the loop-closed-use at the loop exit of the original scalar
6793 result. (The reduction result is expected to have two immediate uses,
6794 one at the latch block, and one at the loop exit). For double
6795 reductions we are looking for exit phis of the outer loop. */
6796 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6798 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6800 if (!is_gimple_debug (USE_STMT (use_p)))
6801 phis.safe_push (USE_STMT (use_p));
6803 else
6805 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6807 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6809 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6811 if (!flow_bb_inside_loop_p (loop,
6812 gimple_bb (USE_STMT (phi_use_p)))
6813 && !is_gimple_debug (USE_STMT (phi_use_p)))
6814 phis.safe_push (USE_STMT (phi_use_p));
6820 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6822 /* Replace the uses: */
6823 orig_name = PHI_RESULT (exit_phi);
6825 /* Look for a single use at the target of the skip edge. */
6826 if (unify_with_main_loop_p)
6828 use_operand_p use_p;
6829 gimple *user;
6830 if (!single_imm_use (orig_name, &use_p, &user))
6831 gcc_unreachable ();
6832 orig_name = gimple_get_lhs (user);
6835 scalar_result = scalar_results[k];
6836 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6838 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6839 SET_USE (use_p, scalar_result);
6840 update_stmt (use_stmt);
6844 phis.release ();
6848 /* Return a vector of type VECTYPE that is equal to the vector select
6849 operation "MASK ? VEC : IDENTITY". Insert the select statements
6850 before GSI. */
6852 static tree
6853 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6854 tree vec, tree identity)
6856 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6857 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6858 mask, vec, identity);
6859 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6860 return cond;
6863 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6864 order, starting with LHS. Insert the extraction statements before GSI and
6865 associate the new scalar SSA names with variable SCALAR_DEST.
6866 Return the SSA name for the result. */
6868 static tree
6869 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6870 tree_code code, tree lhs, tree vector_rhs)
6872 tree vectype = TREE_TYPE (vector_rhs);
6873 tree scalar_type = TREE_TYPE (vectype);
6874 tree bitsize = TYPE_SIZE (scalar_type);
6875 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6876 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6878 for (unsigned HOST_WIDE_INT bit_offset = 0;
6879 bit_offset < vec_size_in_bits;
6880 bit_offset += element_bitsize)
6882 tree bitpos = bitsize_int (bit_offset);
6883 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6884 bitsize, bitpos);
6886 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6887 rhs = make_ssa_name (scalar_dest, stmt);
6888 gimple_assign_set_lhs (stmt, rhs);
6889 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6891 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6892 tree new_name = make_ssa_name (scalar_dest, stmt);
6893 gimple_assign_set_lhs (stmt, new_name);
6894 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6895 lhs = new_name;
6897 return lhs;
6900 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6901 type of the vector input. */
6903 static internal_fn
6904 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6906 internal_fn mask_reduc_fn;
6907 internal_fn mask_len_reduc_fn;
6909 switch (reduc_fn)
6911 case IFN_FOLD_LEFT_PLUS:
6912 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6913 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6914 break;
6916 default:
6917 return IFN_LAST;
6920 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6921 OPTIMIZE_FOR_SPEED))
6922 return mask_reduc_fn;
6923 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6924 OPTIMIZE_FOR_SPEED))
6925 return mask_len_reduc_fn;
6926 return IFN_LAST;
6929 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6930 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6931 statement. CODE is the operation performed by STMT_INFO and OPS are
6932 its scalar operands. REDUC_INDEX is the index of the operand in
6933 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6934 implements in-order reduction, or IFN_LAST if we should open-code it.
6935 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6936 that should be used to control the operation in a fully-masked loop. */
6938 static bool
6939 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6940 stmt_vec_info stmt_info,
6941 gimple_stmt_iterator *gsi,
6942 gimple **vec_stmt, slp_tree slp_node,
6943 gimple *reduc_def_stmt,
6944 tree_code code, internal_fn reduc_fn,
6945 tree ops[3], tree vectype_in,
6946 int reduc_index, vec_loop_masks *masks,
6947 vec_loop_lens *lens)
6949 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6950 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6951 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6953 int ncopies;
6954 if (slp_node)
6955 ncopies = 1;
6956 else
6957 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6959 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6960 gcc_assert (ncopies == 1);
6961 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6963 if (slp_node)
6964 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6965 TYPE_VECTOR_SUBPARTS (vectype_in)));
6967 tree op0 = ops[1 - reduc_index];
6969 int group_size = 1;
6970 stmt_vec_info scalar_dest_def_info;
6971 auto_vec<tree> vec_oprnds0;
6972 if (slp_node)
6974 auto_vec<vec<tree> > vec_defs (2);
6975 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6976 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6977 vec_defs[0].release ();
6978 vec_defs[1].release ();
6979 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6980 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6982 else
6984 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6985 op0, &vec_oprnds0);
6986 scalar_dest_def_info = stmt_info;
6989 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6990 tree scalar_type = TREE_TYPE (scalar_dest);
6991 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6993 int vec_num = vec_oprnds0.length ();
6994 gcc_assert (vec_num == 1 || slp_node);
6995 tree vec_elem_type = TREE_TYPE (vectype_out);
6996 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
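/* When inactive lanes have to be merged with an identity value (see
   merge_with_identity below), plain zero works unless signed zeros
   must be honored, in which case -0.0 is used since x + -0.0 == x
   for every x.  */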
6998 tree vector_identity = NULL_TREE;
6999 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7001 vector_identity = build_zero_cst (vectype_out);
7002 if (!HONOR_SIGNED_ZEROS (vectype_out))
7004 else
7006 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7007 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7008 vector_identity);
7012 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7013 int i;
7014 tree def0;
7015 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7017 gimple *new_stmt;
7018 tree mask = NULL_TREE;
7019 tree len = NULL_TREE;
7020 tree bias = NULL_TREE;
7021 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7022 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7023 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7025 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7026 i, 1);
7027 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7028 bias = build_int_cst (intQI_type_node, biasval);
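/* With length-based partial vectors the mask does not deactivate any
   lanes; pass an all-true mask and let LEN (adjusted by BIAS) limit
   the active elements.  */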
7029 mask = build_minus_one_cst (truth_type_for (vectype_in));
7032 /* Handle MINUS by adding the negative. */
7033 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7035 tree negated = make_ssa_name (vectype_out);
7036 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7037 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7038 def0 = negated;
7041 if (mask && mask_reduc_fn == IFN_LAST)
7042 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7043 vector_identity);
7045 /* On the first iteration the input is simply the scalar phi
7046 result, and for subsequent iterations it is the output of
7047 the preceding operation. */
7048 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7050 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7051 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7052 def0, mask, len, bias);
7053 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7054 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7055 def0, mask);
7056 else
7057 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7058 def0);
7059 /* For chained SLP reductions the output of the previous reduction
7060 operation serves as the input of the next. For the final statement
7061 the output cannot be a temporary - we reuse the original
7062 scalar destination of the last statement. */
7063 if (i != vec_num - 1)
7065 gimple_set_lhs (new_stmt, scalar_dest_var);
7066 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7067 gimple_set_lhs (new_stmt, reduc_var);
7070 else
7072 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
7073 reduc_var, def0);
7074 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7075 /* Remove the statement, so that we can use the same code paths
7076 as for statements that we've just created. */
7077 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7078 gsi_remove (&tmp_gsi, true);
7081 if (i == vec_num - 1)
7083 gimple_set_lhs (new_stmt, scalar_dest);
7084 vect_finish_replace_stmt (loop_vinfo,
7085 scalar_dest_def_info,
7086 new_stmt);
7088 else
7089 vect_finish_stmt_generation (loop_vinfo,
7090 scalar_dest_def_info,
7091 new_stmt, gsi);
7093 if (slp_node)
7094 slp_node->push_vec_def (new_stmt);
7095 else
7097 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7098 *vec_stmt = new_stmt;
7102 return true;
7105 /* Function is_nonwrapping_integer_induction.
7107 Check if STMT_VINFO (which is part of loop LOOP) both increments and
7108 does not cause overflow. */
7110 static bool
7111 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7113 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7114 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7115 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7116 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7117 widest_int ni, max_loop_value, lhs_max;
7118 wi::overflow_type overflow = wi::OVF_NONE;
7120 /* Make sure the loop is integer based. */
7121 if (TREE_CODE (base) != INTEGER_CST
7122 || TREE_CODE (step) != INTEGER_CST)
7123 return false;
7125 /* Check that the max size of the loop will not wrap. */
7127 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7128 return true;
7130 if (! max_stmt_executions (loop, &ni))
7131 return false;
7133 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7134 &overflow);
7135 if (overflow)
7136 return false;
7138 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7139 TYPE_SIGN (lhs_type), &overflow);
7140 if (overflow)
7141 return false;
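/* The induction reaches BASE + STEP * NI; check that this value is
   representable in LHS_TYPE, i.e. the induction cannot wrap.  */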
7143 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7144 <= TYPE_PRECISION (lhs_type));
7147 /* Check if masking can be supported by inserting a conditional expression.
7148 CODE is the code for the operation. COND_FN is the conditional internal
7149 function, if it exists. VECTYPE_IN is the type of the vector input. */
7150 static bool
7151 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7152 tree vectype_in)
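/* If the target supports the conditional internal function directly
   there is no need to emulate masking with a VEC_COND_EXPR.  */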
7154 if (cond_fn != IFN_LAST
7155 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7156 OPTIMIZE_FOR_SPEED))
7157 return false;
7159 if (code.is_tree_code ())
7160 switch (tree_code (code))
7162 case DOT_PROD_EXPR:
7163 case SAD_EXPR:
7164 return true;
7166 default:
7167 break;
7169 return false;
7172 /* Insert a conditional expression to enable masked vectorization. CODE is the
7173 code for the operation. VOP is the array of operands. MASK is the loop
7174 mask. GSI is a statement iterator used to place the new conditional
7175 expression. */
7176 static void
7177 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7178 gimple_stmt_iterator *gsi)
7180 switch (tree_code (code))
7182 case DOT_PROD_EXPR:
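/* Zero the inactive lanes of the second multiplication operand so
   their products contribute nothing to the accumulator.  */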
7184 tree vectype = TREE_TYPE (vop[1]);
7185 tree zero = build_zero_cst (vectype);
7186 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7187 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7188 mask, vop[1], zero);
7189 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7190 vop[1] = masked_op1;
7191 break;
7194 case SAD_EXPR:
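/* For inactive lanes use the first operand as the second one so the
   absolute difference, and thus the contribution to the sum, is
   zero.  */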
7196 tree vectype = TREE_TYPE (vop[1]);
7197 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7198 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7199 mask, vop[1], vop[0]);
7200 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7201 vop[1] = masked_op1;
7202 break;
7205 default:
7206 gcc_unreachable ();
7210 /* Function vectorizable_reduction.
7212 Check if STMT_INFO performs a reduction operation that can be vectorized.
7213 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7214 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7215 Return true if STMT_INFO is vectorizable in this way.
7217 This function also handles reduction idioms (patterns) that have been
7218 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7219 may be of this form:
7220 X = pattern_expr (arg0, arg1, ..., X)
7221 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7222 sequence that had been detected and replaced by the pattern-stmt
7223 (STMT_INFO).
7225 This function also handles reduction of condition expressions, for example:
7226 for (int i = 0; i < N; i++)
7227 if (a[i] < value)
7228 last = a[i];
7229 This is handled by vectorising the loop and creating an additional vector
7230 containing the loop indexes for which "a[i] < value" was true. In the
7231 function epilogue this is reduced to a single max value and then used to
7232 index into the vector of results.
7234 In some cases of reduction patterns, the type of the reduction variable X is
7235 different than the type of the other arguments of STMT_INFO.
7236 In such cases, the vectype that is used when transforming STMT_INFO into
7237 a vector stmt is different than the vectype that is used to determine the
7238 vectorization factor, because it consists of a different number of elements
7239 than the actual number of elements that are being operated upon in parallel.
7241 For example, consider an accumulation of shorts into an int accumulator.
7242 On some targets it's possible to vectorize this pattern operating on 8
7243 shorts at a time (hence, the vectype for purposes of determining the
7244 vectorization factor should be V8HI); on the other hand, the vectype that
7245 is used to create the vector form is actually V4SI (the type of the result).
7247 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7248 indicates what is the actual level of parallelism (V8HI in the example), so
7249 that the right vectorization factor would be derived. This vectype
7250 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7251 be used to create the vectorized stmt. The right vectype for the vectorized
7252 stmt is obtained from the type of the result X:
7253 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7255 This means that, contrary to "regular" reductions (or "regular" stmts in
7256 general), the following equation:
7257 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7258 does *NOT* necessarily hold for reduction patterns. */
7260 bool
7261 vectorizable_reduction (loop_vec_info loop_vinfo,
7262 stmt_vec_info stmt_info, slp_tree slp_node,
7263 slp_instance slp_node_instance,
7264 stmt_vector_for_cost *cost_vec)
7266 tree vectype_in = NULL_TREE;
7267 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7268 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7269 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7270 stmt_vec_info cond_stmt_vinfo = NULL;
7271 int i;
7272 int ncopies;
7273 bool single_defuse_cycle = false;
7274 bool nested_cycle = false;
7275 bool double_reduc = false;
7276 int vec_num;
7277 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7278 tree cond_reduc_val = NULL_TREE;
7280 /* Make sure it was already recognized as a reduction computation. */
7281 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7282 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7283 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7284 return false;
7286 /* The stmt we store reduction analysis meta on. */
7287 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7288 reduc_info->is_reduc_info = true;
7290 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7292 if (is_a <gphi *> (stmt_info->stmt))
7294 if (slp_node)
7296 /* We eventually need to set a vector type on invariant
7297 arguments. */
7298 unsigned j;
7299 slp_tree child;
7300 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7301 if (!vect_maybe_update_slp_op_vectype
7302 (child, SLP_TREE_VECTYPE (slp_node)))
7304 if (dump_enabled_p ())
7305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7306 "incompatible vector types for "
7307 "invariants\n");
7308 return false;
7311 /* Analysis for double-reduction is done on the outer
7312 loop PHI, nested cycles have no further restrictions. */
7313 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7315 else
7316 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7317 return true;
7320 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7321 stmt_vec_info phi_info = stmt_info;
7322 if (!is_a <gphi *> (stmt_info->stmt))
7324 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7325 return true;
7327 if (slp_node)
7329 slp_node_instance->reduc_phis = slp_node;
7330 /* ??? We're leaving slp_node to point to the PHIs; we only
7331 need it to get at the number of vector stmts, which wasn't
7332 yet initialized for the instance root. */
7334 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7336 use_operand_p use_p;
7337 gimple *use_stmt;
7338 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7339 &use_p, &use_stmt);
7340 gcc_assert (res);
7341 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7344 /* PHIs should not participate in patterns. */
7345 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7346 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7348 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7349 and compute the reduction chain length. Discover the real
7350 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7351 tree reduc_def
7352 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7353 loop_latch_edge
7354 (gimple_bb (reduc_def_phi)->loop_father));
7355 unsigned reduc_chain_length = 0;
7356 bool only_slp_reduc_chain = true;
7357 stmt_info = NULL;
7358 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7359 while (reduc_def != PHI_RESULT (reduc_def_phi))
7361 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7362 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7363 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7365 if (dump_enabled_p ())
7366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7367 "reduction chain broken by patterns.\n");
7368 return false;
7370 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7371 only_slp_reduc_chain = false;
7372 /* For epilogue generation live members of the chain need
7373 to point back to the PHI via their original stmt for
7374 info_for_reduction to work. For SLP we need to look at
7375 all lanes here - even though we will only vectorize from
7376 the SLP node with live lane zero, the other live lanes also
7377 need to be identified as part of a reduction to be able
7378 to skip code generation for them. */
7379 if (slp_for_stmt_info)
7381 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7382 if (STMT_VINFO_LIVE_P (s))
7383 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7385 else if (STMT_VINFO_LIVE_P (vdef))
7386 STMT_VINFO_REDUC_DEF (def) = phi_info;
7387 gimple_match_op op;
7388 if (!gimple_extract_op (vdef->stmt, &op))
7390 if (dump_enabled_p ())
7391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7392 "reduction chain includes unsupported"
7393 " statement type.\n");
7394 return false;
7396 if (CONVERT_EXPR_CODE_P (op.code))
7398 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7400 if (dump_enabled_p ())
7401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7402 "conversion in the reduction chain.\n");
7403 return false;
7406 else if (!stmt_info)
7407 /* First non-conversion stmt. */
7408 stmt_info = vdef;
7409 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7410 reduc_chain_length++;
7411 if (!stmt_info && slp_node)
7412 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7414 /* PHIs should not participate in patterns. */
7415 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7417 if (nested_in_vect_loop_p (loop, stmt_info))
7419 loop = loop->inner;
7420 nested_cycle = true;
7423 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7424 element. */
7425 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7427 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7428 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7430 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7431 gcc_assert (slp_node
7432 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7434 /* 1. Is vectorizable reduction? */
7435 /* Not supportable if the reduction variable is used in the loop, unless
7436 it's a reduction chain. */
7437 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7438 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7439 return false;
7441 /* Reductions that are not used even in an enclosing outer-loop,
7442 are expected to be "live" (used out of the loop). */
7443 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7444 && !STMT_VINFO_LIVE_P (stmt_info))
7445 return false;
7447 /* 2. Has this been recognized as a reduction pattern?
7449 Check if STMT represents a pattern that has been recognized
7450 in earlier analysis stages. For stmts that represent a pattern,
7451 the STMT_VINFO_RELATED_STMT field records the last stmt in
7452 the original sequence that constitutes the pattern. */
7454 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7455 if (orig_stmt_info)
7457 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7458 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7461 /* 3. Check the operands of the operation. The first operands are defined
7462 inside the loop body. The last operand is the reduction variable,
7463 which is defined by the loop-header-phi. */
7465 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7466 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7467 gimple_match_op op;
7468 if (!gimple_extract_op (stmt_info->stmt, &op))
7469 gcc_unreachable ();
7470 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7471 || op.code == WIDEN_SUM_EXPR
7472 || op.code == SAD_EXPR);
7474 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7475 && !SCALAR_FLOAT_TYPE_P (op.type))
7476 return false;
7478 /* Do not try to vectorize bit-precision reductions. */
7479 if (!type_has_mode_precision_p (op.type))
7480 return false;
7482 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7483 which means the only use of the PHI may be in the lane-reducing operation. */
7484 if (lane_reduc_code_p
7485 && reduc_chain_length != 1
7486 && !only_slp_reduc_chain)
7488 if (dump_enabled_p ())
7489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7490 "lane-reducing reduction with extra stmts.\n");
7491 return false;
7494 /* All uses but the last are expected to be defined in the loop.
7495 The last use is the reduction variable. In case of nested cycle this
7496 assumption is not true: we use reduc_index to record the index of the
7497 reduction variable. */
7498 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7499 /* We need to skip an extra operand for COND_EXPRs with embedded
7500 comparison. */
7501 unsigned opno_adjust = 0;
7502 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7503 opno_adjust = 1;
7504 for (i = 0; i < (int) op.num_ops; i++)
7506 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7507 if (i == 0 && op.code == COND_EXPR)
7508 continue;
7510 stmt_vec_info def_stmt_info;
7511 enum vect_def_type dt;
7512 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7513 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7514 &vectype_op[i], &def_stmt_info))
7516 if (dump_enabled_p ())
7517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7518 "use not simple.\n");
7519 return false;
7521 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7522 continue;
7524 /* There should be only one cycle def in the stmt, the one
7525 leading to reduc_def. */
7526 if (VECTORIZABLE_CYCLE_DEF (dt))
7527 return false;
7529 if (!vectype_op[i])
7530 vectype_op[i]
7531 = get_vectype_for_scalar_type (loop_vinfo,
7532 TREE_TYPE (op.ops[i]), slp_op[i]);
7534 /* To properly compute ncopies we are interested in the widest
7535 non-reduction input type in case we're looking at a widening
7536 accumulation that we later handle in vect_transform_reduction. */
7537 if (lane_reduc_code_p
7538 && vectype_op[i]
7539 && (!vectype_in
7540 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7541 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7542 vectype_in = vectype_op[i];
7544 if (op.code == COND_EXPR)
7546 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7547 if (dt == vect_constant_def)
7549 cond_reduc_dt = dt;
7550 cond_reduc_val = op.ops[i];
7552 if (dt == vect_induction_def
7553 && def_stmt_info
7554 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7556 cond_reduc_dt = dt;
7557 cond_stmt_vinfo = def_stmt_info;
7561 if (!vectype_in)
7562 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7563 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7565 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7566 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7567 /* If we have a condition reduction, see if we can simplify it further. */
7568 if (v_reduc_type == COND_REDUCTION)
7570 if (slp_node)
7571 return false;
7573 /* When the condition uses the reduction value in the condition, fail. */
7574 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7576 if (dump_enabled_p ())
7577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7578 "condition depends on previous iteration\n");
7579 return false;
7582 if (reduc_chain_length == 1
7583 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7584 OPTIMIZE_FOR_SPEED)
7585 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7586 vectype_in,
7587 OPTIMIZE_FOR_SPEED)))
7589 if (dump_enabled_p ())
7590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7591 "optimizing condition reduction with"
7592 " FOLD_EXTRACT_LAST.\n");
7593 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7595 else if (cond_reduc_dt == vect_induction_def)
7597 tree base
7598 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7599 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7601 gcc_assert (TREE_CODE (base) == INTEGER_CST
7602 && TREE_CODE (step) == INTEGER_CST);
7603 cond_reduc_val = NULL_TREE;
7604 enum tree_code cond_reduc_op_code = ERROR_MARK;
7605 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7606 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7608 /* Find a suitable value: below base for MAX_EXPR, above base for
7609 MIN_EXPR; for now punt if base is the minimum value of the type
7610 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7611 else if (tree_int_cst_sgn (step) == -1)
7613 cond_reduc_op_code = MIN_EXPR;
7614 if (tree_int_cst_sgn (base) == -1)
7615 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7616 else if (tree_int_cst_lt (base,
7617 TYPE_MAX_VALUE (TREE_TYPE (base))))
7618 cond_reduc_val
7619 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7621 else
7623 cond_reduc_op_code = MAX_EXPR;
7624 if (tree_int_cst_sgn (base) == 1)
7625 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7626 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7627 base))
7628 cond_reduc_val
7629 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7631 if (cond_reduc_val)
7633 if (dump_enabled_p ())
7634 dump_printf_loc (MSG_NOTE, vect_location,
7635 "condition expression based on "
7636 "integer induction.\n");
7637 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7638 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7639 = cond_reduc_val;
7640 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7643 else if (cond_reduc_dt == vect_constant_def)
7645 enum vect_def_type cond_initial_dt;
7646 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7647 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7648 if (cond_initial_dt == vect_constant_def
7649 && types_compatible_p (TREE_TYPE (cond_initial_val),
7650 TREE_TYPE (cond_reduc_val)))
7652 tree e = fold_binary (LE_EXPR, boolean_type_node,
7653 cond_initial_val, cond_reduc_val);
7654 if (e && (integer_onep (e) || integer_zerop (e)))
7656 if (dump_enabled_p ())
7657 dump_printf_loc (MSG_NOTE, vect_location,
7658 "condition expression based on "
7659 "compile time constant.\n");
7660 /* Record reduction code at analysis stage. */
7661 STMT_VINFO_REDUC_CODE (reduc_info)
7662 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7663 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7669 if (STMT_VINFO_LIVE_P (phi_info))
7670 return false;
7672 if (slp_node)
7673 ncopies = 1;
7674 else
7675 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7677 gcc_assert (ncopies >= 1);
7679 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7681 if (nested_cycle)
7683 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7684 == vect_double_reduction_def);
7685 double_reduc = true;
7688 /* 4.2. Check support for the epilog operation.
7690 If STMT represents a reduction pattern, then the type of the
7691 reduction variable may be different than the type of the rest
7692 of the arguments. For example, consider the case of accumulation
7693 of shorts into an int accumulator; The original code:
7694 S1: int_a = (int) short_a;
7695 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7697 was replaced with:
7698 STMT: int_acc = widen_sum <short_a, int_acc>
7700 This means that:
7701 1. The tree-code that is used to create the vector operation in the
7702 epilog code (that reduces the partial results) is not the
7703 tree-code of STMT, but is rather the tree-code of the original
7704 stmt from the pattern that STMT is replacing. I.e., in the example
7705 above we want to use 'widen_sum' in the loop, but 'plus' in the
7706 epilog.
7707 2. The type (mode) we use to check available target support
7708 for the vector operation to be created in the *epilog*, is
7709 determined by the type of the reduction variable (in the example
7710 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7711 However the type (mode) we use to check available target support
7712 for the vector operation to be created *inside the loop*, is
7713 determined by the type of the other arguments to STMT (in the
7714 example we'd check this: optab_handler (widen_sum_optab,
7715 vect_short_mode)).
7717 This is contrary to "regular" reductions, in which the types of all
7718 the arguments are the same as the type of the reduction variable.
7719 For "regular" reductions we can therefore use the same vector type
7720 (and also the same tree-code) when generating the epilog code and
7721 when generating the code inside the loop. */
7723 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7724 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7726 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7727 if (reduction_type == TREE_CODE_REDUCTION)
7729 /* Check whether it's ok to change the order of the computation.
7730 Generally, when vectorizing a reduction we change the order of the
7731 computation. This may change the behavior of the program in some
7732 cases, so we need to check that this is ok. One exception is when
7733 vectorizing an outer-loop: the inner-loop is executed sequentially,
7734 and therefore vectorizing reductions in the inner-loop during
7735 outer-loop vectorization is safe. Likewise when we are vectorizing
7736 a series of reductions using SLP and the VF is one, the reductions
7737 are performed in scalar order. */
7738 if (slp_node
7739 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7740 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7742 else if (needs_fold_left_reduction_p (op.type, orig_code))
7744 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7745 is not directly used in stmt. */
7746 if (!only_slp_reduc_chain
7747 && reduc_chain_length != 1)
7749 if (dump_enabled_p ())
7750 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7751 "in-order reduction chain without SLP.\n");
7752 return false;
7754 STMT_VINFO_REDUC_TYPE (reduc_info)
7755 = reduction_type = FOLD_LEFT_REDUCTION;
7757 else if (!commutative_binary_op_p (orig_code, op.type)
7758 || !associative_binary_op_p (orig_code, op.type))
7760 if (dump_enabled_p ())
7761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7762 "reduction: not commutative/associative");
7763 return false;
7767 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7768 && ncopies > 1)
7770 if (dump_enabled_p ())
7771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7772 "multiple types in double reduction or condition "
7773 "reduction or fold-left reduction.\n");
7774 return false;
7777 internal_fn reduc_fn = IFN_LAST;
7778 if (reduction_type == TREE_CODE_REDUCTION
7779 || reduction_type == FOLD_LEFT_REDUCTION
7780 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7781 || reduction_type == CONST_COND_REDUCTION)
7783 if (reduction_type == FOLD_LEFT_REDUCTION
7784 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7785 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7787 if (reduc_fn != IFN_LAST
7788 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7789 OPTIMIZE_FOR_SPEED))
7791 if (dump_enabled_p ())
7792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7793 "reduc op not supported by target.\n");
7795 reduc_fn = IFN_LAST;
7798 else
7800 if (!nested_cycle || double_reduc)
7802 if (dump_enabled_p ())
7803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7804 "no reduc code for scalar code.\n");
7806 return false;
7810 else if (reduction_type == COND_REDUCTION)
7812 int scalar_precision
7813 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7814 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7815 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7816 vectype_out);
7818 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7819 OPTIMIZE_FOR_SPEED))
7820 reduc_fn = IFN_REDUC_MAX;
7822 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7824 if (reduction_type != EXTRACT_LAST_REDUCTION
7825 && (!nested_cycle || double_reduc)
7826 && reduc_fn == IFN_LAST
7827 && !nunits_out.is_constant ())
7829 if (dump_enabled_p ())
7830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7831 "missing target support for reduction on"
7832 " variable-length vectors.\n");
7833 return false;
7836 /* For SLP reductions, see if there is a neutral value we can use. */
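/* A neutral value is one that leaves the reduction result unchanged
   when it fills the unused lanes of the initial vector, e.g. 0 for
   PLUS and 1 for MULT.  */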
7837 tree neutral_op = NULL_TREE;
7838 if (slp_node)
7840 tree initial_value = NULL_TREE;
7841 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7842 initial_value = vect_phi_initial_value (reduc_def_phi);
7843 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7844 orig_code, initial_value);
7847 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7849 /* We can't support in-order reductions of code such as this:
7851 for (int i = 0; i < n1; ++i)
7852 for (int j = 0; j < n2; ++j)
7853 l += a[j];
7855 since GCC effectively transforms the loop when vectorizing:
7857 for (int i = 0; i < n1 / VF; ++i)
7858 for (int j = 0; j < n2; ++j)
7859 for (int k = 0; k < VF; ++k)
7860 l += a[j];
7862 which is a reassociation of the original operation. */
7863 if (dump_enabled_p ())
7864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7865 "in-order double reduction not supported.\n");
7867 return false;
7870 if (reduction_type == FOLD_LEFT_REDUCTION
7871 && slp_node
7872 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7874 /* We cannot use in-order reductions in this case because there is
7875 an implicit reassociation of the operations involved. */
7876 if (dump_enabled_p ())
7877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7878 "in-order unchained SLP reductions not supported.\n");
7879 return false;
7882 /* For double reductions, and for SLP reductions with a neutral value,
7883 we construct a variable-length initial vector by loading a vector
7884 full of the neutral value and then shift-and-inserting the start
7885 values into the low-numbered elements. */
7886 if ((double_reduc || neutral_op)
7887 && !nunits_out.is_constant ()
7888 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7889 vectype_out, OPTIMIZE_FOR_SPEED))
7891 if (dump_enabled_p ())
7892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7893 "reduction on variable-length vectors requires"
7894 " target support for a vector-shift-and-insert"
7895 " operation.\n");
7896 return false;
7899 /* Check extra constraints for variable-length unchained SLP reductions. */
7900 if (slp_node
7901 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7902 && !nunits_out.is_constant ())
7904 /* We checked above that we could build the initial vector when
7905 there's a neutral element value. Check here for the case in
7906 which each SLP statement has its own initial value and in which
7907 that value needs to be repeated for every instance of the
7908 statement within the initial vector. */
7909 unsigned int group_size = SLP_TREE_LANES (slp_node);
7910 if (!neutral_op
7911 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7912 TREE_TYPE (vectype_out)))
7914 if (dump_enabled_p ())
7915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7916 "unsupported form of SLP reduction for"
7917 " variable-length vectors: cannot build"
7918 " initial vector.\n");
7919 return false;
7921 /* The epilogue code relies on the number of elements being a multiple
7922 of the group size. The duplicate-and-interleave approach to setting
7923 up the initial vector does too. */
7924 if (!multiple_p (nunits_out, group_size))
7926 if (dump_enabled_p ())
7927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7928 "unsupported form of SLP reduction for"
7929 " variable-length vectors: the vector size"
7930 " is not a multiple of the number of results.\n");
7931 return false;
7935 if (reduction_type == COND_REDUCTION)
7937 widest_int ni;
7939 if (! max_loop_iterations (loop, &ni))
7941 if (dump_enabled_p ())
7942 dump_printf_loc (MSG_NOTE, vect_location,
7943 "loop count not known, cannot create cond "
7944 "reduction.\n");
7945 return false;
7947 /* Convert backedges to iterations. */
7948 ni += 1;
7950 /* The additional index will be the same type as the condition. Check
7951 that the loop count fits into this type less one (because we'll use up the
7952 zero slot for when there are no matches). */
7953 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7954 if (wi::geu_p (ni, wi::to_widest (max_index)))
7956 if (dump_enabled_p ())
7957 dump_printf_loc (MSG_NOTE, vect_location,
7958 "loop size is greater than data size.\n");
7959 return false;
7963 /* In case the vectorization factor (VF) is bigger than the number
7964 of elements that we can fit in a vectype (nunits), we have to generate
7965 more than one vector stmt - i.e - we need to "unroll" the
7966 vector stmt by a factor VF/nunits. For more details see documentation
7967 in vectorizable_operation. */
7969 /* If the reduction is used in an outer loop we need to generate
7970 VF intermediate results, like so (e.g. for ncopies=2):
7971 r0 = phi (init, r0)
7972 r1 = phi (init, r1)
7973 r0 = x0 + r0;
7974 r1 = x1 + r1;
7975 (i.e. we generate VF results in 2 registers).
7976 In this case we have a separate def-use cycle for each copy, and therefore
7977 for each copy we get the vector def for the reduction variable from the
7978 respective phi node created for this copy.
7980 Otherwise (the reduction is unused in the loop nest), we can combine
7981 together intermediate results, like so (e.g. for ncopies=2):
7982 r = phi (init, r)
7983 r = x0 + r;
7984 r = x1 + r;
7985 (i.e. we generate VF/2 results in a single register).
7986 In this case for each copy we get the vector def for the reduction variable
7987 from the vectorized reduction operation generated in the previous iteration.
7989 This only works when we see both the reduction PHI and its only consumer
7990 in vectorizable_reduction and there are no intermediate stmts
7991 participating. When unrolling we want each unrolled iteration to have its
7992 own reduction accumulator since one of the main goals of unrolling a
7993 reduction is to reduce the aggregate loop-carried latency. */
7994 if (ncopies > 1
7995 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7996 && reduc_chain_length == 1
7997 && loop_vinfo->suggested_unroll_factor == 1)
7998 single_defuse_cycle = true;
8000 if (single_defuse_cycle || lane_reduc_code_p)
8002 gcc_assert (op.code != COND_EXPR);
8004 /* 4. Supportable by target? */
8005 bool ok = true;
8007 /* 4.1. check support for the operation in the loop
8009 This isn't necessary for the lane reduction codes, since they
8010 can only be produced by pattern matching, and it's up to the
8011 pattern matcher to test for support. The main reason for
8012 specifically skipping this step is to avoid rechecking whether
8013 mixed-sign dot-products can be implemented using signed
8014 dot-products. */
8015 machine_mode vec_mode = TYPE_MODE (vectype_in);
8016 if (!lane_reduc_code_p
8017 && !directly_supported_p (op.code, vectype_in, optab_vector))
8019 if (dump_enabled_p ())
8020 dump_printf (MSG_NOTE, "op not supported by target.\n");
8021 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8022 || !vect_can_vectorize_without_simd_p (op.code))
8023 ok = false;
8024 else
8025 if (dump_enabled_p ())
8026 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8029 if (vect_emulated_vector_p (vectype_in)
8030 && !vect_can_vectorize_without_simd_p (op.code))
8032 if (dump_enabled_p ())
8033 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8034 return false;
8037 /* lane-reducing operations have to go through vect_transform_reduction.
8038 For the other cases try without the single cycle optimization. */
8039 if (!ok)
8041 if (lane_reduc_code_p)
8042 return false;
8043 else
8044 single_defuse_cycle = false;
8047 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8049 /* If the reduction stmt is one of the patterns that have lane
8050 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8051 if ((ncopies > 1 && ! single_defuse_cycle)
8052 && lane_reduc_code_p)
8054 if (dump_enabled_p ())
8055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8056 "multi def-use cycle not possible for lane-reducing "
8057 "reduction operation\n");
8058 return false;
8061 if (slp_node
8062 && !(!single_defuse_cycle
8063 && !lane_reduc_code_p
8064 && reduction_type != FOLD_LEFT_REDUCTION))
8065 for (i = 0; i < (int) op.num_ops; i++)
8066 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8068 if (dump_enabled_p ())
8069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8070 "incompatible vector types for invariants\n");
8071 return false;
8074 if (slp_node)
8075 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8076 else
8077 vec_num = 1;
8079 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8080 reduction_type, ncopies, cost_vec);
8081 /* Cost the reduction op inside the loop if transformed via
8082 vect_transform_reduction. Otherwise this is costed by the
8083 separate vectorizable_* routines. */
8084 if (single_defuse_cycle || lane_reduc_code_p)
8086 int factor = 1;
8087 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8088 /* Three dot-products and a subtraction. */
8089 factor = 4;
8090 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8091 stmt_info, 0, vect_body);
8094 if (dump_enabled_p ()
8095 && reduction_type == FOLD_LEFT_REDUCTION)
8096 dump_printf_loc (MSG_NOTE, vect_location,
8097 "using an in-order (fold-left) reduction.\n");
8098 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8099 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8100 reductions go through their own vectorizable_* routines. */
8101 if (!single_defuse_cycle
8102 && !lane_reduc_code_p
8103 && reduction_type != FOLD_LEFT_REDUCTION)
8105 stmt_vec_info tem
8106 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8107 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8109 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8110 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8112 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8113 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8115 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8117 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8118 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8119 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
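/* To operate on partial vectors we must be able to ignore inactive
   lanes: either via a conditional internal function or cond-expr
   masking for ordinary reductions, or, for open-coded fold-left
   reductions, via a VEC_COND_EXPR that substitutes the identity
   value.  */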
8121 if (reduction_type != FOLD_LEFT_REDUCTION
8122 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8123 && (cond_fn == IFN_LAST
8124 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8125 OPTIMIZE_FOR_SPEED)))
8127 if (dump_enabled_p ())
8128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8129 "can't operate on partial vectors because"
8130 " no conditional operation is available.\n");
8131 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8133 else if (reduction_type == FOLD_LEFT_REDUCTION
8134 && reduc_fn == IFN_LAST
8135 && !expand_vec_cond_expr_p (vectype_in,
8136 truth_type_for (vectype_in),
8137 SSA_NAME))
8139 if (dump_enabled_p ())
8140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8141 "can't operate on partial vectors because"
8142 " no conditional operation is available.\n");
8143 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8145 else if (reduction_type == FOLD_LEFT_REDUCTION
8146 && reduc_fn == IFN_LAST
8147 && FLOAT_TYPE_P (vectype_in)
8148 && HONOR_SIGNED_ZEROS (vectype_in)
8149 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8151 if (dump_enabled_p ())
8152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8153 "can't operate on partial vectors because"
8154 " signed zeros cannot be preserved.\n");
8155 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8157 else
8159 internal_fn mask_reduc_fn
8160 = get_masked_reduction_fn (reduc_fn, vectype_in);
8162 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8163 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8164 vectype_in, 1);
8165 else
8166 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8167 vectype_in, NULL);
8170 return true;
8173 /* STMT_INFO is a dot-product reduction whose multiplication operands
8174 have different signs. Emit a sequence to emulate the operation
8175 using a series of signed DOT_PROD_EXPRs and return the last
8176 statement generated. VEC_DEST is the result of the vector operation
8177 and VOP lists its inputs. */
8179 static gassign *
8180 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8181 gimple_stmt_iterator *gsi, tree vec_dest,
8182 tree vop[3])
8184 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8185 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8186 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8187 gimple *new_stmt;
8189 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8190 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8191 std::swap (vop[0], vop[1]);
8193 /* Convert all inputs to signed types. */
8194 for (int i = 0; i < 3; ++i)
8195 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8197 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8198 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8199 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8200 vop[i] = tmp;
8203 /* In the comments below we assume 8-bit inputs for simplicity,
8204 but the approach works for any full integer type. */
8206 /* Create a vector of -128. */
8207 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8208 tree min_narrow = build_vector_from_val (narrow_vectype,
8209 min_narrow_elttype);
8211 /* Create a vector of 64. */
8212 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8213 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8214 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8216 /* Emit: SUB_RES = VOP[0] - 128 (computed as VOP[0] + (-128)). */
8217 tree sub_res = make_ssa_name (narrow_vectype);
8218 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8219 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8221 /* Emit:
8223 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8224 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8225 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8227 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8228 Doing the two 64 * y steps first allows more time to compute x. */
8229 tree stage1 = make_ssa_name (wide_vectype);
8230 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8231 vop[1], half_narrow, vop[2]);
8232 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8234 tree stage2 = make_ssa_name (wide_vectype);
8235 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8236 vop[1], half_narrow, stage1);
8237 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8239 tree stage3 = make_ssa_name (wide_vectype);
8240 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8241 sub_res, vop[1], stage2);
8242 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8244 /* Convert STAGE3 to the reduction type. */
8245 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8248 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8249 value. */
8251 bool
8252 vect_transform_reduction (loop_vec_info loop_vinfo,
8253 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8254 gimple **vec_stmt, slp_tree slp_node)
8256 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8257 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8258 int i;
8259 int ncopies;
8260 int vec_num;
8262 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8263 gcc_assert (reduc_info->is_reduc_info);
8265 if (nested_in_vect_loop_p (loop, stmt_info))
8267 loop = loop->inner;
8268 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8271 gimple_match_op op;
8272 if (!gimple_extract_op (stmt_info->stmt, &op))
8273 gcc_unreachable ();
8275 /* All uses but the last are expected to be defined in the loop.
8276 The last use is the reduction variable. In case of nested cycle this
8277 assumption is not true: we use reduc_index to record the index of the
8278 reduction variable. */
8279 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8280 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8281 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8282 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8284 if (slp_node)
8286 ncopies = 1;
8287 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8289 else
8291 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8292 vec_num = 1;
8295 code_helper code = canonicalize_code (op.code, op.type);
8296 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8297 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8298 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8299 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8301 /* Transform. */
8302 tree new_temp = NULL_TREE;
8303 auto_vec<tree> vec_oprnds0;
8304 auto_vec<tree> vec_oprnds1;
8305 auto_vec<tree> vec_oprnds2;
8306 tree def0;
8308 if (dump_enabled_p ())
8309 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8311 /* FORNOW: Multiple types are not supported for condition. */
8312 if (code == COND_EXPR)
8313 gcc_assert (ncopies == 1);
8315 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8317 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8318 if (reduction_type == FOLD_LEFT_REDUCTION)
8320 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8321 gcc_assert (code.is_tree_code ());
8322 return vectorize_fold_left_reduction
8323 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8324 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
8325 lens);
8328 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8329 gcc_assert (single_defuse_cycle
8330 || code == DOT_PROD_EXPR
8331 || code == WIDEN_SUM_EXPR
8332 || code == SAD_EXPR);
8334 /* Create the destination vector */
8335 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8336 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8338 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8339 single_defuse_cycle && reduc_index == 0
8340 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8341 single_defuse_cycle && reduc_index == 1
8342 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8343 op.num_ops == 3
8344 && !(single_defuse_cycle && reduc_index == 2)
8345 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8346 if (single_defuse_cycle)
8348 gcc_assert (!slp_node);
8349 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8350 op.ops[reduc_index],
8351 reduc_index == 0 ? &vec_oprnds0
8352 : (reduc_index == 1 ? &vec_oprnds1
8353 : &vec_oprnds2));
8356 bool emulated_mixed_dot_prod
8357 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8358 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8360 gimple *new_stmt;
8361 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8362 if (masked_loop_p && !mask_by_cond_expr)
8364 /* No conditional ifns have been defined for dot-product yet. */
8365 gcc_assert (code != DOT_PROD_EXPR);
8367 /* Make sure that the reduction accumulator is vop[0]. */
8368 if (reduc_index == 1)
8370 gcc_assert (commutative_binary_op_p (code, op.type));
8371 std::swap (vop[0], vop[1]);
8373 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8374 vec_num * ncopies, vectype_in, i);
8375 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8376 vop[0], vop[1], vop[0]);
8377 new_temp = make_ssa_name (vec_dest, call);
8378 gimple_call_set_lhs (call, new_temp);
8379 gimple_call_set_nothrow (call, true);
8380 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8381 new_stmt = call;
8383 else
8385 if (op.num_ops == 3)
8386 vop[2] = vec_oprnds2[i];
8388 if (masked_loop_p && mask_by_cond_expr)
8390 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8391 vec_num * ncopies, vectype_in, i);
8392 build_vect_cond_expr (code, vop, mask, gsi);
8395 if (emulated_mixed_dot_prod)
8396 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8397 vec_dest, vop);
8398 else if (code.is_internal_fn ())
8399 new_stmt = gimple_build_call_internal (internal_fn (code),
8400 op.num_ops,
8401 vop[0], vop[1], vop[2]);
8402 else
8403 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8404 vop[0], vop[1], vop[2]);
8405 new_temp = make_ssa_name (vec_dest, new_stmt);
8406 gimple_set_lhs (new_stmt, new_temp);
8407 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8410 if (slp_node)
8411 slp_node->push_vec_def (new_stmt);
8412 else if (single_defuse_cycle
8413 && i < ncopies - 1)
8415 if (reduc_index == 0)
8416 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8417 else if (reduc_index == 1)
8418 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8419 else if (reduc_index == 2)
8420 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8422 else
8423 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8426 if (!slp_node)
8427 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8429 return true;
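/* Aside: a standalone sketch (plain C++) of the masked-reduction scheme
   used above for a PLUS reduction: the conditional internal function call
   (or the VEC_COND_EXPR fallback) makes inactive lanes pass the
   accumulator through unchanged.  Names are illustrative only.  */
static int
masked_sum_sketch (const int *a, const bool *mask, int n, int acc)
{
  for (int i = 0; i < n; ++i)
    /* Per lane: mask ? acc + a : acc, i.e. COND_ADD (mask, acc, a, acc).  */
    acc = mask[i] ? acc + a[i] : acc;
  return acc;
}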
8432 /* Transform phase of a cycle PHI. */
8434 bool
8435 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8436 stmt_vec_info stmt_info, gimple **vec_stmt,
8437 slp_tree slp_node, slp_instance slp_node_instance)
8439 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8440 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8441 int i;
8442 int ncopies;
8443 int j;
8444 bool nested_cycle = false;
8445 int vec_num;
8447 if (nested_in_vect_loop_p (loop, stmt_info))
8449 loop = loop->inner;
8450 nested_cycle = true;
8453 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8454 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8455 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8456 gcc_assert (reduc_info->is_reduc_info);
8458 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8459 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8460 /* Leave the scalar phi in place. */
8461 return true;
8463 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8464 /* For a nested cycle we do not fill the above. */
8465 if (!vectype_in)
8466 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8467 gcc_assert (vectype_in);
8469 if (slp_node)
8471 /* The size vect_schedule_slp_instance computes is off for us. */
8472 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8473 * SLP_TREE_LANES (slp_node), vectype_in);
8474 ncopies = 1;
8476 else
8478 vec_num = 1;
8479 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8482 /* Check whether we should use a single PHI node and accumulate
8483 vectors to one before the backedge. */
8484 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8485 ncopies = 1;
8487 /* Create the destination vector */
8488 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8489 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8490 vectype_out);
8492 /* Get the loop-entry arguments. */
8493 tree vec_initial_def = NULL_TREE;
8494 auto_vec<tree> vec_initial_defs;
8495 if (slp_node)
8497 vec_initial_defs.reserve (vec_num);
8498 if (nested_cycle)
8500 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8501 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8502 &vec_initial_defs);
8504 else
8506 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8507 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8508 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8510 unsigned int num_phis = stmts.length ();
8511 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8512 num_phis = 1;
8513 initial_values.reserve (num_phis);
8514 for (unsigned int i = 0; i < num_phis; ++i)
8516 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8517 initial_values.quick_push (vect_phi_initial_value (this_phi));
8519 if (vec_num == 1)
8520 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8521 if (!initial_values.is_empty ())
8523 tree initial_value
8524 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8525 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8526 tree neutral_op
8527 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8528 code, initial_value);
8529 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8530 &vec_initial_defs, vec_num,
8531 stmts.length (), neutral_op);
8535 else
8537 /* Get at the scalar def before the loop that defines the initial
8538 value of the reduction variable. */
8539 tree initial_def = vect_phi_initial_value (phi);
8540 reduc_info->reduc_initial_values.safe_push (initial_def);
8541 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8542 and we can't use zero for induc_val, use initial_def. Similarly
8543 for REDUC_MIN and initial_def larger than the base. */
8544 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8546 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8547 if (TREE_CODE (initial_def) == INTEGER_CST
8548 && !integer_zerop (induc_val)
8549 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8550 && tree_int_cst_lt (initial_def, induc_val))
8551 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8552 && tree_int_cst_lt (induc_val, initial_def))))
8554 induc_val = initial_def;
8555 /* Communicate we used the initial_def to epilogue
8556 generation. */
8557 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8559 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8561 else if (nested_cycle)
8563 /* Do not use an adjustment def as that case is not supported
8564 correctly if ncopies is not one. */
8565 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8566 ncopies, initial_def,
8567 &vec_initial_defs);
8569 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8570 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8571 /* Fill the initial vector with the initial scalar value. */
8572 vec_initial_def
8573 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8574 initial_def, initial_def);
8575 else
8577 if (ncopies == 1)
8578 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8579 if (!reduc_info->reduc_initial_values.is_empty ())
8581 initial_def = reduc_info->reduc_initial_values[0];
8582 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8583 tree neutral_op
8584 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8585 code, initial_def);
8586 gcc_assert (neutral_op);
8587 /* Try to simplify the vector initialization by applying an
8588 adjustment after the reduction has been performed. */
8589 if (!reduc_info->reused_accumulator
8590 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8591 && !operand_equal_p (neutral_op, initial_def))
8593 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8594 = initial_def;
8595 initial_def = neutral_op;
8597 vec_initial_def
8598 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8599 initial_def, neutral_op);
8604 if (vec_initial_def)
8606 vec_initial_defs.create (ncopies);
8607 for (i = 0; i < ncopies; ++i)
8608 vec_initial_defs.quick_push (vec_initial_def);
8611 if (auto *accumulator = reduc_info->reused_accumulator)
8613 tree def = accumulator->reduc_input;
8614 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8616 unsigned int nreduc;
8617 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8618 (TREE_TYPE (def)),
8619 TYPE_VECTOR_SUBPARTS (vectype_out),
8620 &nreduc);
8621 gcc_assert (res);
8622 gimple_seq stmts = NULL;
8623 /* Reduce the single vector to a smaller one. */
8624 if (nreduc != 1)
8626 /* Perform the reduction in the appropriate type. */
8627 tree rvectype = vectype_out;
8628 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8629 TREE_TYPE (TREE_TYPE (def))))
8630 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8631 TYPE_VECTOR_SUBPARTS
8632 (vectype_out));
8633 def = vect_create_partial_epilog (def, rvectype,
8634 STMT_VINFO_REDUC_CODE
8635 (reduc_info),
8636 &stmts);
8638 /* The epilogue loop might use a different vector mode, like
8639 VNx2DI vs. V2DI. */
8640 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8642 tree reduc_type = build_vector_type_for_mode
8643 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8644 def = gimple_convert (&stmts, reduc_type, def);
8646 /* Adjust the input so we pick up the partially reduced value
8647 for the skip edge in vect_create_epilog_for_reduction. */
8648 accumulator->reduc_input = def;
8649 /* And the reduction could be carried out using a different sign. */
8650 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8651 def = gimple_convert (&stmts, vectype_out, def);
8652 if (loop_vinfo->main_loop_edge)
8654 /* While we'd like to insert on the edge, doing so would split
8655 blocks and disturb bookkeeping; we will also eventually need
8656 this on the skip edge. Rely on sinking to fix up the optimal
8657 placement and insert in the predecessor instead. */
8658 gimple_stmt_iterator gsi
8659 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8660 /* Insert before a cond that eventually skips the
8661 epilogue. */
8662 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8663 gsi_prev (&gsi);
8664 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8666 else
8667 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8668 stmts);
8670 if (loop_vinfo->main_loop_edge)
8671 vec_initial_defs[0]
8672 = vect_get_main_loop_result (loop_vinfo, def,
8673 vec_initial_defs[0]);
8674 else
8675 vec_initial_defs.safe_push (def);
8678 /* Generate the reduction PHIs upfront. */
8679 for (i = 0; i < vec_num; i++)
8681 tree vec_init_def = vec_initial_defs[i];
8682 for (j = 0; j < ncopies; j++)
8684 /* Create the reduction-phi that defines the reduction
8685 operand. */
8686 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8688 /* Set the loop-entry arg of the reduction-phi. */
8689 if (j != 0 && nested_cycle)
8690 vec_init_def = vec_initial_defs[j];
8691 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8692 UNKNOWN_LOCATION);
8694 /* The loop-latch arg is set in epilogue processing. */
8696 if (slp_node)
8697 slp_node->push_vec_def (new_phi);
8698 else
8700 if (j == 0)
8701 *vec_stmt = new_phi;
8702 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8707 return true;
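/* Aside: a standalone sketch (plain C++) of the initial-value handling
   above for a PLUS reduction: instead of baking the scalar initial value
   into the initial vector, seed the accumulator with the neutral element
   and apply the initial value as an epilogue adjustment after the final
   reduction.  Names are illustrative only.  */
static int
reduc_epilogue_adjustment_sketch (const int *a, int n, int initial_def)
{
  const int neutral_op = 0;	/* Neutral element for PLUS.  */
  int acc = neutral_op;		/* What the reduction PHI starts from.  */
  for (int i = 0; i < n; ++i)
    acc += a[i];
  /* The recorded epilogue adjustment: fold initial_def back in.  */
  return acc + initial_def;
}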
8710 /* Vectorizes LC PHIs. */
8712 bool
8713 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8714 stmt_vec_info stmt_info, gimple **vec_stmt,
8715 slp_tree slp_node)
8717 if (!loop_vinfo
8718 || !is_a <gphi *> (stmt_info->stmt)
8719 || gimple_phi_num_args (stmt_info->stmt) != 1)
8720 return false;
8722 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8723 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8724 return false;
8726 if (!vec_stmt) /* transformation not required. */
8728 /* Deal with copies from externs or constants that are disguised as
8729 loop-closed PHI nodes (PR97886). */
8730 if (slp_node
8731 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8732 SLP_TREE_VECTYPE (slp_node)))
8734 if (dump_enabled_p ())
8735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8736 "incompatible vector types for invariants\n");
8737 return false;
8739 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8740 return true;
8743 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8744 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8745 basic_block bb = gimple_bb (stmt_info->stmt);
8746 edge e = single_pred_edge (bb);
8747 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8748 auto_vec<tree> vec_oprnds;
8749 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8750 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8751 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8752 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8754 /* Create the vectorized LC PHI node. */
8755 gphi *new_phi = create_phi_node (vec_dest, bb);
8756 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8757 if (slp_node)
8758 slp_node->push_vec_def (new_phi);
8759 else
8760 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8762 if (!slp_node)
8763 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8765 return true;
8768 /* Vectorizes PHIs. */
8770 bool
8771 vectorizable_phi (vec_info *,
8772 stmt_vec_info stmt_info, gimple **vec_stmt,
8773 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8775 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8776 return false;
8778 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8779 return false;
8781 tree vectype = SLP_TREE_VECTYPE (slp_node);
8783 if (!vec_stmt) /* transformation not required. */
8785 slp_tree child;
8786 unsigned i;
8787 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8788 if (!child)
8790 if (dump_enabled_p ())
8791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8792 "PHI node with unvectorized backedge def\n");
8793 return false;
8795 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8797 if (dump_enabled_p ())
8798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8799 "incompatible vector types for invariants\n");
8800 return false;
8802 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8803 && !useless_type_conversion_p (vectype,
8804 SLP_TREE_VECTYPE (child)))
8806 /* With bools we can have mask and non-mask precision vectors
8807 or different non-mask precisions. While pattern recognition
8808 is supposed to guarantee consistency here, bugs in it can
8809 cause mismatches (PR103489 and PR103800 for example).
8810 Deal with them here instead of ICEing later. */
8811 if (dump_enabled_p ())
8812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8813 "incompatible vector type setup from "
8814 "bool pattern detection\n");
8815 return false;
8818 /* For single-argument PHIs assume coalescing which means zero cost
8819 for the scalar and the vector PHIs. This avoids artificially
8820 favoring the vector path (but may pessimize it in some cases). */
8821 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8822 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8823 vector_stmt, stmt_info, vectype, 0, vect_body);
8824 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8825 return true;
8828 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8829 basic_block bb = gimple_bb (stmt_info->stmt);
8830 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8831 auto_vec<gphi *> new_phis;
8832 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8834 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8836 /* Skip not yet vectorized defs. */
8837 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8838 && SLP_TREE_VEC_DEFS (child).is_empty ())
8839 continue;
8841 auto_vec<tree> vec_oprnds;
8842 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8843 if (!new_phis.exists ())
8845 new_phis.create (vec_oprnds.length ());
8846 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8848 /* Create the vectorized PHI node. */
8849 new_phis.quick_push (create_phi_node (vec_dest, bb));
8850 slp_node->push_vec_def (new_phis[j]);
8853 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8854 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8855 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8857 /* We should have at least one already vectorized child. */
8858 gcc_assert (new_phis.exists ());
8860 return true;
8863 /* Vectorizes first order recurrences. An overview of the transformation
8864 is described below. Suppose we have the following loop.
8866 int t = 0;
8867 for (int i = 0; i < n; ++i)
8869 b[i] = a[i] - t;
8870 t = a[i];
8873 There is a first-order recurrence on 't' (the previous a[i]). For this loop, the scalar IR
8874 looks (simplified) like:
8876 scalar.preheader:
8877 init = 0;
8879 scalar.body:
8880 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8881 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8882 _1 = a[i]
8883 b[i] = _1 - _2
8884 if (i < n) goto scalar.body
8886 In this example, _2 is a recurrence because its value depends on the
8887 previous iteration. We vectorize this as (VF = 4)
8889 vector.preheader:
8890 vect_init = vect_cst(..., ..., ..., 0)
8892 vector.body
8893 i = PHI <0(vector.preheader), i+4(vector.body)>
8894 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8895 vect_2 = a[i, i+1, i+2, i+3];
8896 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8897 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8898 if (..) goto vector.body
8900 In this function, vectorizable_recurr, we code generate both the
8901 vector PHI node and the permute since those together compute the
8902 vectorized value of the scalar PHI. We do not yet have the
8903 backedge value to fill in there nor into the vec_perm. Those
8904 are filled in maybe_set_vectorized_backedge_value and
8905 vect_schedule_scc.
8907 TODO: Since the scalar loop does not have a use of the recurrence
8908 outside of the loop, the natural way to implement peeling via
8909 vectorizing the live value doesn't work. For now peeling of loops
8910 with a recurrence is not implemented. For SLP the supported cases
8911 are restricted to those requiring a single vector recurrence PHI. */
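/* Aside: a standalone sketch (plain C++) of the transformation described
   above for VF = 4.  The permuted vector { prev[3], cur[0], cur[1], cur[2] }
   is exactly vec_perm (vect_1, vect_2, { 3, 4, 5, 6 }) and supplies the
   value of 't' for each lane.  The scalar tail is omitted; names are
   illustrative only.  */
static void
first_order_recurrence_sketch (const int *a, int *b, int n)
{
  int vect_1[4] = { 0, 0, 0, 0 };	/* vect_init; only lane 3 matters.  */
  for (int i = 0; i + 4 <= n; i += 4)
    {
      int vect_2[4] = { a[i], a[i + 1], a[i + 2], a[i + 3] };
      int vect_3[4] = { vect_1[3], vect_2[0], vect_2[1], vect_2[2] };
      for (int lane = 0; lane < 4; ++lane)
	b[i + lane] = vect_2[lane] - vect_3[lane];
      for (int lane = 0; lane < 4; ++lane)
	vect_1[lane] = vect_2[lane];	/* Backedge value of the vector PHI.  */
    }
}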
8913 bool
8914 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8915 gimple **vec_stmt, slp_tree slp_node,
8916 stmt_vector_for_cost *cost_vec)
8918 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8919 return false;
8921 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8923 /* So far we only support first-order recurrence auto-vectorization. */
8924 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8925 return false;
8927 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8928 unsigned ncopies;
8929 if (slp_node)
8930 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8931 else
8932 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8933 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8934 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8935 /* We need to be able to make progress with a single vector. */
8936 if (maybe_gt (dist * 2, nunits))
8938 if (dump_enabled_p ())
8939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8940 "first order recurrence exceeds half of "
8941 "a vector\n");
8942 return false;
8945 /* First-order recurrence autovectorization needs to handle permutation
8946 with indices = [nunits-1, nunits, nunits+1, ...]. */
8947 vec_perm_builder sel (nunits, 1, 3);
8948 for (int i = 0; i < 3; ++i)
8949 sel.quick_push (nunits - dist + i);
8950 vec_perm_indices indices (sel, 2, nunits);
8952 if (!vec_stmt) /* transformation not required. */
8954 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8955 indices))
8956 return false;
8958 if (slp_node)
8960 /* We eventually need to set a vector type on invariant
8961 arguments. */
8962 unsigned j;
8963 slp_tree child;
8964 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8965 if (!vect_maybe_update_slp_op_vectype
8966 (child, SLP_TREE_VECTYPE (slp_node)))
8968 if (dump_enabled_p ())
8969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8970 "incompatible vector types for "
8971 "invariants\n");
8972 return false;
8975 /* The recurrence costs the initialization vector and one permute
8976 for each copy. */
8977 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8978 stmt_info, 0, vect_prologue);
8979 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8980 stmt_info, 0, vect_body);
8981 if (dump_enabled_p ())
8982 dump_printf_loc (MSG_NOTE, vect_location,
8983 "vectorizable_recurr: inside_cost = %d, "
8984 "prologue_cost = %d .\n", inside_cost,
8985 prologue_cost);
8987 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8988 return true;
8991 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8992 basic_block bb = gimple_bb (phi);
8993 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8994 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8996 gimple_seq stmts = NULL;
8997 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8998 gsi_insert_seq_on_edge_immediate (pe, stmts);
9000 tree vec_init = build_vector_from_val (vectype, preheader);
9001 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9003 /* Create the vectorized first-order PHI node. */
9004 tree vec_dest = vect_get_new_vect_var (vectype,
9005 vect_simple_var, "vec_recur_");
9006 gphi *new_phi = create_phi_node (vec_dest, bb);
9007 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9009 /* Insert the shuffles needed for the first-order recurrence autovectorization:
9010 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9011 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9013 /* Insert the required permute after the latch definition. The
9014 second and later operands are tentative and will be updated when we have
9015 vectorized the latch definition. */
9016 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9017 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9018 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9019 gsi_next (&gsi2);
9021 for (unsigned i = 0; i < ncopies; ++i)
9023 vec_dest = make_ssa_name (vectype);
9024 gassign *vperm
9025 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9026 i == 0 ? gimple_phi_result (new_phi) : NULL,
9027 NULL, perm);
9028 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9030 if (slp_node)
9031 slp_node->push_vec_def (vperm);
9032 else
9033 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9036 if (!slp_node)
9037 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9038 return true;
9041 /* Return true if VECTYPE represents a vector that requires lowering
9042 by the vector lowering pass. */
9044 bool
9045 vect_emulated_vector_p (tree vectype)
9047 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9048 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9049 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9052 /* Return true if we can emulate CODE on an integer mode representation
9053 of a vector. */
9055 bool
9056 vect_can_vectorize_without_simd_p (tree_code code)
9058 switch (code)
9060 case PLUS_EXPR:
9061 case MINUS_EXPR:
9062 case NEGATE_EXPR:
9063 case BIT_AND_EXPR:
9064 case BIT_IOR_EXPR:
9065 case BIT_XOR_EXPR:
9066 case BIT_NOT_EXPR:
9067 return true;
9069 default:
9070 return false;
9074 /* Likewise, but taking a code_helper. */
9076 bool
9077 vect_can_vectorize_without_simd_p (code_helper code)
9079 return (code.is_tree_code ()
9080 && vect_can_vectorize_without_simd_p (tree_code (code)));
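/* Aside: a standalone sketch (plain C++) of emulating a vector operation
   on an integer mode, as allowed by the two predicates above.  Bitwise
   AND/IOR/XOR/NOT never carry across lane boundaries, so one word-sized
   operation acts on all lanes of e.g. a four-times-8-bit "vector" held in
   a 32-bit integer.  Names are illustrative only.  */
static unsigned int
emulated_v4qi_xor_sketch (unsigned int x, unsigned int y)
{
  /* Each of the four 8-bit lanes is XORed independently.  */
  return x ^ y;
}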
9083 /* Create vector init for vectorized iv. */
9084 static tree
9085 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9086 tree step_expr, poly_uint64 nunits,
9087 tree vectype,
9088 enum vect_induction_op_type induction_type)
9090 unsigned HOST_WIDE_INT const_nunits;
9091 tree vec_shift, vec_init, new_name;
9092 unsigned i;
9093 tree itype = TREE_TYPE (vectype);
9095 /* iv_loop is the loop to be vectorized. Create:
9096 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
9097 new_name = gimple_convert (stmts, itype, init_expr);
9098 switch (induction_type)
9100 case vect_step_op_shr:
9101 case vect_step_op_shl:
9102 /* Build the Initial value from shift_expr. */
9103 vec_init = gimple_build_vector_from_val (stmts,
9104 vectype,
9105 new_name);
9106 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9107 build_zero_cst (itype), step_expr);
9108 vec_init = gimple_build (stmts,
9109 (induction_type == vect_step_op_shr
9110 ? RSHIFT_EXPR : LSHIFT_EXPR),
9111 vectype, vec_init, vec_shift);
9112 break;
9114 case vect_step_op_neg:
9116 vec_init = gimple_build_vector_from_val (stmts,
9117 vectype,
9118 new_name);
9119 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9120 vectype, vec_init);
9121 /* The encoding has 2 interleaved stepped patterns. */
9122 vec_perm_builder sel (nunits, 2, 3);
9123 sel.quick_grow (6);
9124 for (i = 0; i < 3; i++)
9126 sel[2 * i] = i;
9127 sel[2 * i + 1] = i + nunits;
9129 vec_perm_indices indices (sel, 2, nunits);
9130 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9131 fail when vec_init is const vector. In that situation vec_perm is not
9132 really needed. */
9133 tree perm_mask_even
9134 = vect_gen_perm_mask_any (vectype, indices);
9135 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9136 vectype,
9137 vec_init, vec_neg,
9138 perm_mask_even);
9140 break;
9142 case vect_step_op_mul:
9144 /* Use unsigned mult to avoid undefined signed integer overflow. */
9145 gcc_assert (nunits.is_constant (&const_nunits));
9146 tree utype = unsigned_type_for (itype);
9147 tree uvectype = build_vector_type (utype,
9148 TYPE_VECTOR_SUBPARTS (vectype));
9149 new_name = gimple_convert (stmts, utype, new_name);
9150 vec_init = gimple_build_vector_from_val (stmts,
9151 uvectype,
9152 new_name);
9153 tree_vector_builder elts (uvectype, const_nunits, 1);
9154 tree elt_step = build_one_cst (utype);
9156 elts.quick_push (elt_step);
9157 for (i = 1; i < const_nunits; i++)
9159 /* Create: elt_step = elt_step * step_expr, i.e. pow (step_expr, i). */
9160 elt_step = gimple_build (stmts, MULT_EXPR,
9161 utype, elt_step, step_expr);
9162 elts.quick_push (elt_step);
9164 /* Create the vector [1, step_expr, pow (step_expr, 2), ...,
9165 pow (step_expr, nunits-1)]. */
9166 tree vec_mul = gimple_build_vector (stmts, &elts);
9167 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9168 vec_init, vec_mul);
9169 vec_init = gimple_convert (stmts, vectype, vec_init);
9171 break;
9173 default:
9174 gcc_unreachable ();
9177 return vec_init;
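/* Aside: a standalone sketch (plain C++) of the vect_step_op_mul initial
   vector built above, for four lanes:
   vec_init = [X, X*S, X*S^2, X*S^3], computed in unsigned arithmetic to
   avoid undefined signed overflow.  Names are illustrative only.  */
static void
mul_iv_init_sketch (unsigned int init, unsigned int step, unsigned int out[4])
{
  unsigned int elt_step = 1;		/* 1, S, S^2, S^3 in turn.  */
  for (int i = 0; i < 4; ++i)
    {
      out[i] = init * elt_step;
      elt_step *= step;
    }
}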
9180 /* Peel init_expr by skip_niters iterations for induction_type. */
9181 tree
9182 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9183 tree skip_niters, tree step_expr,
9184 enum vect_induction_op_type induction_type)
9186 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9187 tree type = TREE_TYPE (init_expr);
9188 unsigned prec = TYPE_PRECISION (type);
9189 switch (induction_type)
9191 case vect_step_op_neg:
9192 if (TREE_INT_CST_LOW (skip_niters) % 2)
9193 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9194 /* else no change. */
9195 break;
9197 case vect_step_op_shr:
9198 case vect_step_op_shl:
9199 skip_niters = gimple_convert (stmts, type, skip_niters);
9200 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9201 /* When the shift amount >= precision, we need to avoid undefined behavior.
9202 In the original loop there is no UB, and according to the semantics
9203 init_expr should become 0 for lshr and ashl, and init_expr >> (prec - 1) for ashr. */
9204 if (!tree_fits_uhwi_p (step_expr)
9205 || tree_to_uhwi (step_expr) >= prec)
9207 if (induction_type == vect_step_op_shl
9208 || TYPE_UNSIGNED (type))
9209 init_expr = build_zero_cst (type);
9210 else
9211 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9212 init_expr,
9213 wide_int_to_tree (type, prec - 1));
9215 else
9216 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9217 ? RSHIFT_EXPR : LSHIFT_EXPR),
9218 type, init_expr, step_expr);
9219 break;
9221 case vect_step_op_mul:
9223 tree utype = unsigned_type_for (type);
9224 init_expr = gimple_convert (stmts, utype, init_expr);
9225 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
9226 wide_int begin = wi::to_wide (step_expr);
9227 for (unsigned i = 0; i != skipn - 1; i++)
9228 begin = wi::mul (begin, wi::to_wide (step_expr));
9229 tree mult_expr = wide_int_to_tree (utype, begin);
9230 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
9231 init_expr = gimple_convert (stmts, type, init_expr);
9233 break;
9235 default:
9236 gcc_unreachable ();
9239 return init_expr;
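/* Aside: a standalone sketch (plain C++) of peeling a vect_step_op_mul IV
   as done above: after skipping SKIP iterations the start value becomes
   init * step^skip, again in unsigned arithmetic.  Names are illustrative
   only.  */
static unsigned int
peel_mul_iv_init_sketch (unsigned int init, unsigned int step,
			 unsigned int skip)
{
  unsigned int mult = 1;
  for (unsigned int i = 0; i < skip; ++i)
    mult *= step;
  return init * mult;
}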
9242 /* Create vector step for vectorized iv. */
9243 static tree
9244 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9245 poly_uint64 vf,
9246 enum vect_induction_op_type induction_type)
9248 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9249 tree new_name = NULL;
9250 /* Step should be pow (step, vf) for mult induction. */
9251 if (induction_type == vect_step_op_mul)
9253 gcc_assert (vf.is_constant ());
9254 wide_int begin = wi::to_wide (step_expr);
9256 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9257 begin = wi::mul (begin, wi::to_wide (step_expr));
9259 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9261 else if (induction_type == vect_step_op_neg)
9262 /* Do nothing. */
9264 else
9265 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9266 expr, step_expr);
9267 return new_name;
9270 static tree
9271 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9272 stmt_vec_info stmt_info,
9273 tree new_name, tree vectype,
9274 enum vect_induction_op_type induction_type)
9276 /* No step is needed for neg induction. */
9277 if (induction_type == vect_step_op_neg)
9278 return NULL;
9280 tree t = unshare_expr (new_name);
9281 gcc_assert (CONSTANT_CLASS_P (new_name)
9282 || TREE_CODE (new_name) == SSA_NAME);
9283 tree new_vec = build_vector_from_val (vectype, t);
9284 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9285 new_vec, vectype, NULL);
9286 return vec_step;
9289 /* Update the vectorized iv with vec_step; induc_def is the initial value. */
9290 static tree
9291 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9292 tree induc_def, tree vec_step,
9293 enum vect_induction_op_type induction_type)
9295 tree vec_def = induc_def;
9296 switch (induction_type)
9298 case vect_step_op_mul:
9300 /* Use unsigned mult to avoid undefined signed integer overflow. */
9301 tree uvectype
9302 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9303 TYPE_VECTOR_SUBPARTS (vectype));
9304 vec_def = gimple_convert (stmts, uvectype, vec_def);
9305 vec_step = gimple_convert (stmts, uvectype, vec_step);
9306 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9307 vec_def, vec_step);
9308 vec_def = gimple_convert (stmts, vectype, vec_def);
9310 break;
9312 case vect_step_op_shr:
9313 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9314 vec_def, vec_step);
9315 break;
9317 case vect_step_op_shl:
9318 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9319 vec_def, vec_step);
9320 break;
9321 case vect_step_op_neg:
9322 vec_def = induc_def;
9323 /* Do nothing. */
9324 break;
9325 default:
9326 gcc_unreachable ();
9329 return vec_def;
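/* Aside: a standalone sketch (plain C++) of one vector iteration of a
   vect_step_op_shr IV with VF = 4.  The prolog builds
   [X>>0, X>>S, X>>2*S, X>>3*S]; the in-loop update then shifts every lane
   by the same step * VF.  This assumes 4 * step is smaller than the
   element precision, mirroring the check performed during analysis.
   Names are illustrative only.  */
static void
shr_iv_update_sketch (unsigned int iv[4], unsigned int step)
{
  unsigned int vec_step = 4 * step;	/* step * VF, splat into the vector step.  */
  for (int lane = 0; lane < 4; ++lane)
    iv[lane] >>= vec_step;
}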
9333 /* Function vectorizable_induction
9335 Check if STMT_INFO performs a nonlinear induction computation that can be
9336 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9337 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9338 basic block.
9339 Return true if STMT_INFO is vectorizable in this way. */
9341 static bool
9342 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9343 stmt_vec_info stmt_info,
9344 gimple **vec_stmt, slp_tree slp_node,
9345 stmt_vector_for_cost *cost_vec)
9347 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9348 unsigned ncopies;
9349 bool nested_in_vect_loop = false;
9350 class loop *iv_loop;
9351 tree vec_def;
9352 edge pe = loop_preheader_edge (loop);
9353 basic_block new_bb;
9354 tree vec_init, vec_step;
9355 tree new_name;
9356 gimple *new_stmt;
9357 gphi *induction_phi;
9358 tree induc_def, vec_dest;
9359 tree init_expr, step_expr;
9360 tree niters_skip;
9361 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9362 unsigned i;
9363 gimple_stmt_iterator si;
9365 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9367 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9368 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9369 enum vect_induction_op_type induction_type
9370 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9372 gcc_assert (induction_type > vect_step_op_add);
9374 if (slp_node)
9375 ncopies = 1;
9376 else
9377 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9378 gcc_assert (ncopies >= 1);
9380 /* FORNOW. Only handle nonlinear induction in the same loop. */
9381 if (nested_in_vect_loop_p (loop, stmt_info))
9383 if (dump_enabled_p ())
9384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9385 "nonlinear induction in nested loop.\n");
9386 return false;
9389 iv_loop = loop;
9390 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9392 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9393 update for each iv and a permutation to generate the wanted vector iv. */
9394 if (slp_node)
9396 if (dump_enabled_p ())
9397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9398 "SLP induction not supported for nonlinear"
9399 " induction.\n");
9400 return false;
9403 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9405 if (dump_enabled_p ())
9406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9407 "floating point nonlinear induction vectorization"
9408 " not supported.\n");
9409 return false;
9412 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9413 init_expr = vect_phi_initial_value (phi);
9414 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9415 && TREE_CODE (step_expr) == INTEGER_CST);
9416 /* step_expr should have the same type as init_expr,
9417 i.e. for uint64 a >> 1 the scalar step is int but a vector<uint64> shift is used. */
9418 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9420 if (TREE_CODE (init_expr) == INTEGER_CST)
9421 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9422 else
9423 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9424 TREE_TYPE (init_expr)));
9426 switch (induction_type)
9428 case vect_step_op_neg:
9429 if (TREE_CODE (init_expr) != INTEGER_CST
9430 && TREE_CODE (init_expr) != REAL_CST)
9432 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9433 if (!directly_supported_p (NEGATE_EXPR, vectype))
9434 return false;
9436 /* The encoding has 2 interleaved stepped patterns. */
9437 vec_perm_builder sel (nunits, 2, 3);
9438 machine_mode mode = TYPE_MODE (vectype);
9439 sel.quick_grow (6);
9440 for (i = 0; i < 3; i++)
9442 sel[i * 2] = i;
9443 sel[i * 2 + 1] = i + nunits;
9445 vec_perm_indices indices (sel, 2, nunits);
9446 if (!can_vec_perm_const_p (mode, mode, indices))
9447 return false;
9449 break;
9451 case vect_step_op_mul:
9453 /* Check for backend support of MULT_EXPR. */
9454 if (!directly_supported_p (MULT_EXPR, vectype))
9455 return false;
9457 /* ?? How to construct the vector step for variable-length vectors:
9458 [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
9459 if (!vf.is_constant ())
9460 return false;
9462 break;
9464 case vect_step_op_shr:
9465 /* Check for backend support of RSHIFT_EXPR. */
9466 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9467 return false;
9469 /* Don't shift more than the type precision to avoid undefined behavior. */
9470 if (!tree_fits_uhwi_p (step_expr)
9471 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9472 TYPE_PRECISION (TREE_TYPE (init_expr))))
9473 return false;
9474 break;
9476 case vect_step_op_shl:
9477 /* Check for backend support of LSHIFT_EXPR. */
9478 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9479 return false;
9481 /* Don't shift more than the type precision to avoid undefined behavior. */
9482 if (!tree_fits_uhwi_p (step_expr)
9483 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9484 TYPE_PRECISION (TREE_TYPE (init_expr))))
9485 return false;
9487 break;
9489 default:
9490 gcc_unreachable ();
9493 if (!vec_stmt) /* transformation not required. */
9495 unsigned inside_cost = 0, prologue_cost = 0;
9496 /* loop cost for vec_loop. */
9498 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9499 stmt_info, 0, vect_body);
9501 /* loop cost for vec_loop. Neg induction doesn't have any
9502 inside_cost. */
9503 if (induction_type == vect_step_op_neg)
9504 inside_cost = 0;
9506 /* prologue cost for vec_init and vec_step. */
9507 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9508 stmt_info, 0, vect_prologue);
9510 if (dump_enabled_p ())
9511 dump_printf_loc (MSG_NOTE, vect_location,
9512 "vect_model_induction_cost: inside_cost = %d, "
9513 "prologue_cost = %d. \n", inside_cost,
9514 prologue_cost);
9516 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9517 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9518 return true;
9521 /* Transform. */
9523 /* Compute a vector variable, initialized with the first VF values of
9524 the induction variable. E.g., for an iv with IV_PHI='X' and
9525 evolution S, for a vector of 4 units, we want to compute:
9526 [X, X + S, X + 2*S, X + 3*S]. */
9528 if (dump_enabled_p ())
9529 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9531 pe = loop_preheader_edge (iv_loop);
9532 /* Find the first insertion point in the BB. */
9533 basic_block bb = gimple_bb (phi);
9534 si = gsi_after_labels (bb);
9536 gimple_seq stmts = NULL;
9538 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9539 /* If we are using the loop mask to "peel" for alignment then we need
9540 to adjust the start value here. */
9541 if (niters_skip != NULL_TREE)
9542 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9543 step_expr, induction_type);
9545 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9546 step_expr, nunits, vectype,
9547 induction_type);
9548 if (stmts)
9550 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9551 gcc_assert (!new_bb);
9554 stmts = NULL;
9555 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9556 vf, induction_type);
9557 if (stmts)
9559 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9560 gcc_assert (!new_bb);
9563 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9564 new_name, vectype,
9565 induction_type);
9566 /* Create the following def-use cycle:
9567 loop prolog:
9568 vec_init = ...
9569 vec_step = ...
9570 loop:
9571 vec_iv = PHI <vec_init, vec_loop>
9573 STMT
9575 vec_loop = vec_iv + vec_step; */
9577 /* Create the induction-phi that defines the induction-operand. */
9578 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9579 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9580 induc_def = PHI_RESULT (induction_phi);
9582 /* Create the iv update inside the loop. */
9583 stmts = NULL;
9584 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9585 induc_def, vec_step,
9586 induction_type);
9588 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9589 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9591 /* Set the arguments of the phi node: */
9592 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9593 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9594 UNKNOWN_LOCATION);
9596 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9597 *vec_stmt = induction_phi;
9599 /* In case the vectorization factor (VF) is bigger than the number
9600 of elements that we can fit in a vectype (nunits), we have to generate
9601 more than one vector stmt, i.e. we need to "unroll" the
9602 vector stmt by a factor of VF/nunits. For more details see the
9603 documentation in vectorizable_operation. */
9605 if (ncopies > 1)
9607 stmts = NULL;
9608 /* FORNOW. This restriction should be relaxed. */
9609 gcc_assert (!nested_in_vect_loop);
9611 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9612 nunits, induction_type);
9614 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9615 new_name, vectype,
9616 induction_type);
9617 vec_def = induc_def;
9618 for (i = 1; i < ncopies; i++)
9620 /* vec_i = vec_prev + vec_step. */
9621 stmts = NULL;
9622 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9623 vec_def, vec_step,
9624 induction_type);
9625 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9626 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9627 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9631 if (dump_enabled_p ())
9632 dump_printf_loc (MSG_NOTE, vect_location,
9633 "transform induction: created def-use cycle: %G%G",
9634 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9636 return true;
9639 /* Function vectorizable_induction
9641 Check if STMT_INFO performs an induction computation that can be vectorized.
9642 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9643 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9644 Return true if STMT_INFO is vectorizable in this way. */
9646 bool
9647 vectorizable_induction (loop_vec_info loop_vinfo,
9648 stmt_vec_info stmt_info,
9649 gimple **vec_stmt, slp_tree slp_node,
9650 stmt_vector_for_cost *cost_vec)
9652 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9653 unsigned ncopies;
9654 bool nested_in_vect_loop = false;
9655 class loop *iv_loop;
9656 tree vec_def;
9657 edge pe = loop_preheader_edge (loop);
9658 basic_block new_bb;
9659 tree new_vec, vec_init, vec_step, t;
9660 tree new_name;
9661 gimple *new_stmt;
9662 gphi *induction_phi;
9663 tree induc_def, vec_dest;
9664 tree init_expr, step_expr;
9665 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9666 unsigned i;
9667 tree expr;
9668 gimple_stmt_iterator si;
9669 enum vect_induction_op_type induction_type
9670 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9672 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9673 if (!phi)
9674 return false;
9676 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9677 return false;
9679 /* Make sure it was recognized as induction computation. */
9680 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9681 return false;
9683 /* Handle nonlinear induction in a separate place. */
9684 if (induction_type != vect_step_op_add)
9685 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9686 vec_stmt, slp_node, cost_vec);
9688 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9689 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9691 if (slp_node)
9692 ncopies = 1;
9693 else
9694 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9695 gcc_assert (ncopies >= 1);
9697 /* FORNOW. These restrictions should be relaxed. */
9698 if (nested_in_vect_loop_p (loop, stmt_info))
9700 imm_use_iterator imm_iter;
9701 use_operand_p use_p;
9702 gimple *exit_phi;
9703 edge latch_e;
9704 tree loop_arg;
9706 if (ncopies > 1)
9708 if (dump_enabled_p ())
9709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9710 "multiple types in nested loop.\n");
9711 return false;
9714 exit_phi = NULL;
9715 latch_e = loop_latch_edge (loop->inner);
9716 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9717 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9719 gimple *use_stmt = USE_STMT (use_p);
9720 if (is_gimple_debug (use_stmt))
9721 continue;
9723 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9725 exit_phi = use_stmt;
9726 break;
9729 if (exit_phi)
9731 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9732 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9733 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9735 if (dump_enabled_p ())
9736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9737 "inner-loop induction only used outside "
9738 "of the outer vectorized loop.\n");
9739 return false;
9743 nested_in_vect_loop = true;
9744 iv_loop = loop->inner;
9746 else
9747 iv_loop = loop;
9748 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9750 if (slp_node && !nunits.is_constant ())
9752 /* The current SLP code creates the step value element-by-element. */
9753 if (dump_enabled_p ())
9754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9755 "SLP induction not supported for variable-length"
9756 " vectors.\n");
9757 return false;
9760 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9762 if (dump_enabled_p ())
9763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9764 "floating point induction vectorization disabled\n");
9765 return false;
9768 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9769 gcc_assert (step_expr != NULL_TREE);
9770 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9772 /* Check for backend support of PLUS/MINUS_EXPR. */
9773 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9774 || !directly_supported_p (MINUS_EXPR, step_vectype))
9775 return false;
9777 if (!vec_stmt) /* transformation not required. */
9779 unsigned inside_cost = 0, prologue_cost = 0;
9780 if (slp_node)
9782 /* We eventually need to set a vector type on invariant
9783 arguments. */
9784 unsigned j;
9785 slp_tree child;
9786 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9787 if (!vect_maybe_update_slp_op_vectype
9788 (child, SLP_TREE_VECTYPE (slp_node)))
9790 if (dump_enabled_p ())
9791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9792 "incompatible vector types for "
9793 "invariants\n");
9794 return false;
9796 /* loop cost for vec_loop. */
9797 inside_cost
9798 = record_stmt_cost (cost_vec,
9799 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9800 vector_stmt, stmt_info, 0, vect_body);
9801 /* prologue cost for vec_init (if not nested) and step. */
9802 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9803 scalar_to_vec,
9804 stmt_info, 0, vect_prologue);
9806 else /* if (!slp_node) */
9808 /* loop cost for vec_loop. */
9809 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9810 stmt_info, 0, vect_body);
9811 /* prologue cost for vec_init and vec_step. */
9812 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9813 stmt_info, 0, vect_prologue);
9815 if (dump_enabled_p ())
9816 dump_printf_loc (MSG_NOTE, vect_location,
9817 "vect_model_induction_cost: inside_cost = %d, "
9818 "prologue_cost = %d .\n", inside_cost,
9819 prologue_cost);
9821 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9822 DUMP_VECT_SCOPE ("vectorizable_induction");
9823 return true;
9826 /* Transform. */
9828 /* Compute a vector variable, initialized with the first VF values of
9829 the induction variable. E.g., for an iv with IV_PHI='X' and
9830 evolution S, for a vector of 4 units, we want to compute:
9831 [X, X + S, X + 2*S, X + 3*S]. */
9833 if (dump_enabled_p ())
9834 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9836 pe = loop_preheader_edge (iv_loop);
9837 /* Find the first insertion point in the BB. */
9838 basic_block bb = gimple_bb (phi);
9839 si = gsi_after_labels (bb);
9841 /* For SLP induction we have to generate several IVs; for example,
9842 with group size 3 and 4-lane vectors we need
9843 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9844 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
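/* Concretely (an illustrative aside): lane ELTN of vector IV number IVN
   starts from scalar IV number (IVN*NUNITS + ELTN) % GROUP_SIZE, advanced
   by (IVN*NUNITS + ELTN) / GROUP_SIZE of that IV's own steps.  For the
   group size 3, 4-lane example above, IVN = 1 covers lanes 4..7, i.e.
   [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1], matching the second vector.  */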
9845 if (slp_node)
9847 /* Enforced above. */
9848 unsigned int const_nunits = nunits.to_constant ();
9850 /* The initial values are vectorized, but any lanes > group_size
9851 need adjustment. */
9852 slp_tree init_node
9853 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9855 /* Gather steps. Since we do not vectorize inductions as
9856 cycles we have to reconstruct the step from SCEV data. */
9857 unsigned group_size = SLP_TREE_LANES (slp_node);
9858 tree *steps = XALLOCAVEC (tree, group_size);
9859 tree *inits = XALLOCAVEC (tree, group_size);
9860 stmt_vec_info phi_info;
9861 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9863 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9864 if (!init_node)
9865 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9866 pe->dest_idx);
9869 /* Now generate the IVs. */
9870 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9871 gcc_assert ((const_nunits * nvects) % group_size == 0);
9872 unsigned nivs;
9873 if (nested_in_vect_loop)
9874 nivs = nvects;
9875 else
9877 /* Compute the number of distinct IVs we need. First reduce
9878 group_size if it is a multiple of const_nunits so we get
9879 one IV for a group_size of 4 but const_nunits 2. */
9880 unsigned group_sizep = group_size;
9881 if (group_sizep % const_nunits == 0)
9882 group_sizep = group_sizep / const_nunits;
9883 nivs = least_common_multiple (group_sizep,
9884 const_nunits) / const_nunits;
9886 tree stept = TREE_TYPE (step_vectype);
9887 tree lupdate_mul = NULL_TREE;
9888 if (!nested_in_vect_loop)
9890 /* The number of iterations covered in one vector iteration. */
9891 unsigned lup_mul = (nvects * const_nunits) / group_size;
9892 lupdate_mul
9893 = build_vector_from_val (step_vectype,
9894 SCALAR_FLOAT_TYPE_P (stept)
9895 ? build_real_from_wide (stept, lup_mul,
9896 UNSIGNED)
9897 : build_int_cstu (stept, lup_mul));
9899 tree peel_mul = NULL_TREE;
9900 gimple_seq init_stmts = NULL;
9901 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9903 if (SCALAR_FLOAT_TYPE_P (stept))
9904 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9905 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9906 else
9907 peel_mul = gimple_convert (&init_stmts, stept,
9908 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9909 peel_mul = gimple_build_vector_from_val (&init_stmts,
9910 step_vectype, peel_mul);
9912 unsigned ivn;
9913 auto_vec<tree> vec_steps;
9914 for (ivn = 0; ivn < nivs; ++ivn)
9916 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9917 tree_vector_builder init_elts (vectype, const_nunits, 1);
9918 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9919 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9921 /* The scalar steps of the IVs. */
9922 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9923 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9924 step_elts.quick_push (elt);
9925 if (!init_node)
9927 /* The scalar inits of the IVs if not vectorized. */
9928 elt = inits[(ivn*const_nunits + eltn) % group_size];
9929 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9930 TREE_TYPE (elt)))
9931 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9932 TREE_TYPE (vectype), elt);
9933 init_elts.quick_push (elt);
9935 /* The number of steps to add to the initial values. */
9936 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9937 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9938 ? build_real_from_wide (stept,
9939 mul_elt, UNSIGNED)
9940 : build_int_cstu (stept, mul_elt));
9942 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9943 vec_steps.safe_push (vec_step);
9944 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9945 if (peel_mul)
9946 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9947 step_mul, peel_mul);
9948 if (!init_node)
9949 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9951 /* Create the induction-phi that defines the induction-operand. */
9952 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9953 "vec_iv_");
9954 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9955 induc_def = PHI_RESULT (induction_phi);
9957 /* Create the iv update inside the loop */
9958 tree up = vec_step;
9959 if (lupdate_mul)
9960 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9961 vec_step, lupdate_mul);
9962 gimple_seq stmts = NULL;
9963 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9964 vec_def = gimple_build (&stmts,
9965 PLUS_EXPR, step_vectype, vec_def, up);
9966 vec_def = gimple_convert (&stmts, vectype, vec_def);
9967 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9968 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9969 UNKNOWN_LOCATION);
9971 if (init_node)
9972 vec_init = vect_get_slp_vect_def (init_node, ivn);
9973 if (!nested_in_vect_loop
9974 && !integer_zerop (step_mul))
9976 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9977 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9978 vec_step, step_mul);
9979 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9980 vec_def, up);
9981 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9984 /* Set the arguments of the phi node: */
9985 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9987 slp_node->push_vec_def (induction_phi);
9989 if (!nested_in_vect_loop)
9991 /* Fill up to the number of vectors we need for the whole group. */
9992 nivs = least_common_multiple (group_size,
9993 const_nunits) / const_nunits;
9994 vec_steps.reserve (nivs-ivn);
9995 for (; ivn < nivs; ++ivn)
9997 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9998 vec_steps.quick_push (vec_steps[0]);
10002 /* Re-use IVs when we can. We are generating further vector
10003 stmts by adding VF' * stride to the IVs generated above. */
10004 if (ivn < nvects)
10006 unsigned vfp
10007 = least_common_multiple (group_size, const_nunits) / group_size;
10008 tree lupdate_mul
10009 = build_vector_from_val (step_vectype,
10010 SCALAR_FLOAT_TYPE_P (stept)
10011 ? build_real_from_wide (stept,
10012 vfp, UNSIGNED)
10013 : build_int_cstu (stept, vfp));
10014 for (; ivn < nvects; ++ivn)
10016 gimple *iv
10017 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10018 tree def = gimple_get_lhs (iv);
10019 if (ivn < 2*nivs)
10020 vec_steps[ivn - nivs]
10021 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10022 vec_steps[ivn - nivs], lupdate_mul);
10023 gimple_seq stmts = NULL;
10024 def = gimple_convert (&stmts, step_vectype, def);
10025 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10026 def, vec_steps[ivn % nivs]);
10027 def = gimple_convert (&stmts, vectype, def);
10028 if (gimple_code (iv) == GIMPLE_PHI)
10029 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10030 else
10032 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10033 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10035 slp_node->push_vec_def (def);
10039 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10040 gcc_assert (!new_bb);
10042 return true;
10045 init_expr = vect_phi_initial_value (phi);
10047 gimple_seq stmts = NULL;
10048 if (!nested_in_vect_loop)
10050 /* Convert the initial value to the IV update type. */
10051 tree new_type = TREE_TYPE (step_expr);
10052 init_expr = gimple_convert (&stmts, new_type, init_expr);
10054 /* If we are using the loop mask to "peel" for alignment then we need
10055 to adjust the start value here. */
10056 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10057 if (skip_niters != NULL_TREE)
10059 if (FLOAT_TYPE_P (vectype))
10060 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10061 skip_niters);
10062 else
10063 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10064 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10065 skip_niters, step_expr);
10066 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10067 init_expr, skip_step);
10071 if (stmts)
10073 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10074 gcc_assert (!new_bb);
10077 /* Create the vector that holds the initial_value of the induction. */
10078 if (nested_in_vect_loop)
10080 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10081 been created during vectorization of previous stmts. We obtain it
10082 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10083 auto_vec<tree> vec_inits;
10084 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10085 init_expr, &vec_inits);
10086 vec_init = vec_inits[0];
10087 /* If the initial value is not of proper type, convert it. */
10088 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10090 new_stmt
10091 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10092 vect_simple_var,
10093 "vec_iv_"),
10094 VIEW_CONVERT_EXPR,
10095 build1 (VIEW_CONVERT_EXPR, vectype,
10096 vec_init));
10097 vec_init = gimple_assign_lhs (new_stmt);
10098 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10099 new_stmt);
10100 gcc_assert (!new_bb);
10103 else
10105 /* iv_loop is the loop to be vectorized. Create:
10106 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10107 stmts = NULL;
10108 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10110 unsigned HOST_WIDE_INT const_nunits;
10111 if (nunits.is_constant (&const_nunits))
10113 tree_vector_builder elts (step_vectype, const_nunits, 1);
10114 elts.quick_push (new_name);
10115 for (i = 1; i < const_nunits; i++)
10117 /* Create: new_name_i = new_name + step_expr */
10118 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10119 new_name, step_expr);
10120 elts.quick_push (new_name);
10122 /* Create a vector from [new_name_0, new_name_1, ...,
10123 new_name_nunits-1] */
10124 vec_init = gimple_build_vector (&stmts, &elts);
10126 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10127 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10128 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10129 new_name, step_expr);
10130 else
10132 /* Build:
10133 [base, base, base, ...]
10134 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10135 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10136 gcc_assert (flag_associative_math);
10137 tree index = build_index_vector (step_vectype, 0, 1);
10138 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10139 new_name);
10140 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10141 step_expr);
10142 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10143 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10144 vec_init, step_vec);
10145 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10146 vec_init, base_vec);
10148 vec_init = gimple_convert (&stmts, vectype, vec_init);
10150 if (stmts)
10152 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10153 gcc_assert (!new_bb);
10158 /* Create the vector that holds the step of the induction. */
10159 if (nested_in_vect_loop)
10160 /* iv_loop is nested in the loop to be vectorized. Generate:
10161 vec_step = [S, S, S, S] */
10162 new_name = step_expr;
10163 else
10165 /* iv_loop is the loop to be vectorized. Generate:
10166 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10167 gimple_seq seq = NULL;
10168 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10170 expr = build_int_cst (integer_type_node, vf);
10171 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10173 else
10174 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10175 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10176 expr, step_expr);
10177 if (seq)
10179 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10180 gcc_assert (!new_bb);
10184 t = unshare_expr (new_name);
10185 gcc_assert (CONSTANT_CLASS_P (new_name)
10186 || TREE_CODE (new_name) == SSA_NAME);
10187 new_vec = build_vector_from_val (step_vectype, t);
10188 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10189 new_vec, step_vectype, NULL);
10192 /* Create the following def-use cycle:
10193 loop prolog:
10194 vec_init = ...
10195 vec_step = ...
10196 loop:
10197 vec_iv = PHI <vec_init, vec_loop>
10199 STMT
10201 vec_loop = vec_iv + vec_step; */
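   /* As a purely illustrative sketch (assuming a V4SI vector type, scalar
      init X and scalar step S), the cycle built below corresponds to:

	loop prolog:
	  vec_init = { X, X+S, X+2*S, X+3*S };
	  vec_step = { 4*S, 4*S, 4*S, 4*S };
	loop:
	  vec_iv = PHI <vec_init (preheader), vec_loop (latch)>;
	  ... STMT uses vec_iv ...
	  vec_loop = vec_iv + vec_step;  */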
10203 /* Create the induction-phi that defines the induction-operand. */
10204 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10205 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10206 induc_def = PHI_RESULT (induction_phi);
10208 /* Create the iv update inside the loop */
10209 stmts = NULL;
10210 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10211 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10212 vec_def = gimple_convert (&stmts, vectype, vec_def);
10213 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10214 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10216 /* Set the arguments of the phi node: */
10217 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10218 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10219 UNKNOWN_LOCATION);
10221 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10222 *vec_stmt = induction_phi;
10224 /* In case that vectorization factor (VF) is bigger than the number
10225 of elements that we can fit in a vectype (nunits), we have to generate
10226 more than one vector stmt - i.e - we need to "unroll" the
10227 vector stmt by a factor VF/nunits. For more details see documentation
10228 in vectorizable_operation. */
10230 if (ncopies > 1)
10232 gimple_seq seq = NULL;
10233 /* FORNOW. This restriction should be relaxed. */
10234 gcc_assert (!nested_in_vect_loop);
10236 /* Create the vector that holds the step of the induction. */
10237 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10239 expr = build_int_cst (integer_type_node, nunits);
10240 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10242 else
10243 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10244 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10245 expr, step_expr);
10246 if (seq)
10248 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10249 gcc_assert (!new_bb);
10252 t = unshare_expr (new_name);
10253 gcc_assert (CONSTANT_CLASS_P (new_name)
10254 || TREE_CODE (new_name) == SSA_NAME);
10255 new_vec = build_vector_from_val (step_vectype, t);
10256 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10257 new_vec, step_vectype, NULL);
10259 vec_def = induc_def;
10260 for (i = 1; i < ncopies + 1; i++)
10262 /* vec_i = vec_prev + vec_step */
10263 gimple_seq stmts = NULL;
10264 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10265 vec_def = gimple_build (&stmts,
10266 PLUS_EXPR, step_vectype, vec_def, vec_step);
10267 vec_def = gimple_convert (&stmts, vectype, vec_def);
10269 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10270 if (i < ncopies)
10272 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10273 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10275 else
10277 /* vec_1 = vec_iv + (VF/n * S)
10278 vec_2 = vec_1 + (VF/n * S)
10280 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10282 vec_n is used as vec_loop to save the large step register and
10283 related operations. */
10284 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10285 UNKNOWN_LOCATION);
10290 if (dump_enabled_p ())
10291 dump_printf_loc (MSG_NOTE, vect_location,
10292 "transform induction: created def-use cycle: %G%G",
10293 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10295 return true;
10298 /* Function vectorizable_live_operation.
10300 STMT_INFO computes a value that is used outside the loop. Check if
10301 it can be supported. */
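   For instance (a made-up example, with a, b, c, n and use () standing in
   for arbitrary declarations), the scalar "last" below is live after the
   loop, so its final value has to be extracted from the last active lane
   of the corresponding vector statement:

     int last = 0;
     for (int i = 0; i < n; i++)
       {
	 last = b[i] + c[i];
	 a[i] = last;
       }
     use (last);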
10303 bool
10304 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10305 slp_tree slp_node, slp_instance slp_node_instance,
10306 int slp_index, bool vec_stmt_p,
10307 stmt_vector_for_cost *cost_vec)
10309 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10310 imm_use_iterator imm_iter;
10311 tree lhs, lhs_type, bitsize;
10312 tree vectype = (slp_node
10313 ? SLP_TREE_VECTYPE (slp_node)
10314 : STMT_VINFO_VECTYPE (stmt_info));
10315 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10316 int ncopies;
10317 gimple *use_stmt;
10318 auto_vec<tree> vec_oprnds;
10319 int vec_entry = 0;
10320 poly_uint64 vec_index = 0;
10322 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10324 /* If a stmt of a reduction is live, vectorize it via
10325 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10326 validity so just trigger the transform here. */
10327 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10329 if (!vec_stmt_p)
10330 return true;
10331 if (slp_node)
10333 /* For reduction chains the meta-info is attached to
10334 the group leader. */
10335 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10336 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10337 /* For SLP reductions we vectorize the epilogue for
10338 all involved stmts together. */
10339 else if (slp_index != 0)
10340 return true;
10342 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10343 gcc_assert (reduc_info->is_reduc_info);
10344 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10345 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10346 return true;
10347 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10348 slp_node_instance);
10349 return true;
10352 /* If STMT is not relevant and it is a simple assignment and its inputs are
10353 invariant then it can remain in place, unvectorized. The original last
10354 scalar value that it computes will be used. */
10355 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10357 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10358 if (dump_enabled_p ())
10359 dump_printf_loc (MSG_NOTE, vect_location,
10360 "statement is simple and uses invariant. Leaving in "
10361 "place.\n");
10362 return true;
10365 if (slp_node)
10366 ncopies = 1;
10367 else
10368 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10370 if (slp_node)
10372 gcc_assert (slp_index >= 0);
10374 /* Get the last occurrence of the scalar index from the concatenation of
10375 all the slp vectors. Calculate which slp vector it is and the index
10376 within. */
10377 int num_scalar = SLP_TREE_LANES (slp_node);
10378 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10379 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10381 /* Calculate which vector contains the result, and which lane of
10382 that vector we need. */
10383 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10385 if (dump_enabled_p ())
10386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10387 "Cannot determine which vector holds the"
10388 " final result.\n");
10389 return false;
10393 if (!vec_stmt_p)
10395 /* No transformation required. */
10396 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10398 if (slp_node)
10400 if (dump_enabled_p ())
10401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10402 "can't operate on partial vectors "
10403 "because an SLP statement is live after "
10404 "the loop.\n");
10405 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10407 else if (ncopies > 1)
10409 if (dump_enabled_p ())
10410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10411 "can't operate on partial vectors "
10412 "because ncopies is greater than 1.\n");
10413 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10415 else
10417 gcc_assert (ncopies == 1 && !slp_node);
10418 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10419 OPTIMIZE_FOR_SPEED))
10420 vect_record_loop_mask (loop_vinfo,
10421 &LOOP_VINFO_MASKS (loop_vinfo),
10422 1, vectype, NULL);
10423 else if (can_vec_extract_var_idx_p (
10424 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10425 vect_record_loop_len (loop_vinfo,
10426 &LOOP_VINFO_LENS (loop_vinfo),
10427 1, vectype, 1);
10428 else
10430 if (dump_enabled_p ())
10431 dump_printf_loc (
10432 MSG_MISSED_OPTIMIZATION, vect_location,
10433 "can't operate on partial vectors "
10434 "because the target doesn't support extract "
10435 "last reduction.\n");
10436 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10440 /* ??? Enable for loop costing as well. */
10441 if (!loop_vinfo)
10442 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10443 0, vect_epilogue);
10444 return true;
10447 /* Use the lhs of the original scalar statement. */
10448 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10449 if (dump_enabled_p ())
10450 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10451 "stmt %G", stmt);
10453 lhs = gimple_get_lhs (stmt);
10454 lhs_type = TREE_TYPE (lhs);
10456 bitsize = vector_element_bits_tree (vectype);
10458 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10459 tree vec_lhs, bitstart;
10460 gimple *vec_stmt;
10461 if (slp_node)
10463 gcc_assert (!loop_vinfo
10464 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10465 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10467 /* Get the correct slp vectorized stmt. */
10468 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10469 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10471 /* Get entry to use. */
10472 bitstart = bitsize_int (vec_index);
10473 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10475 else
10477 /* For multiple copies, get the last copy. */
10478 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10479 vec_lhs = gimple_get_lhs (vec_stmt);
10481 /* Get the last lane in the vector. */
10482 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10485 if (loop_vinfo)
10487       /* Ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
10488 	 PHI requirement; insert one phi node for it.  It looks like:
10489 loop;
10491 # lhs' = PHI <lhs>
10493 loop;
10495 # vec_lhs' = PHI <vec_lhs>
10496 new_tree = lane_extract <vec_lhs', ...>;
10497 lhs' = new_tree; */
10499 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10500 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
10501 gcc_assert (single_pred_p (exit_bb));
10503 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10504 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10505 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
10507 gimple_seq stmts = NULL;
10508 tree new_tree;
10509 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10511 /* Emit:
10513 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10515 	     where VEC_LHS is the vectorized live-out result and LEN is
10516 	     the number of active elements in the final iteration.  */
10517 gcc_assert (ncopies == 1 && !slp_node);
10518 gimple_seq tem = NULL;
10519 gimple_stmt_iterator gsi = gsi_last (tem);
10520 tree len
10521 = vect_get_loop_len (loop_vinfo, &gsi,
10522 &LOOP_VINFO_LENS (loop_vinfo),
10523 1, vectype, 0, 0);
10525 /* BIAS - 1. */
10526 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10527 tree bias_minus_one
10528 = int_const_binop (MINUS_EXPR,
10529 build_int_cst (TREE_TYPE (len), biasval),
10530 build_one_cst (TREE_TYPE (len)));
10532 /* LAST_INDEX = LEN + (BIAS - 1). */
10533 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10534 len, bias_minus_one);
10536 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10537 tree scalar_res
10538 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10539 vec_lhs_phi, last_index);
10541 /* Convert the extracted vector element to the scalar type. */
10542 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10544 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10546 /* Emit:
10548 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10550 where VEC_LHS is the vectorized live-out result and MASK is
10551 the loop mask for the final iteration. */
10552 gcc_assert (ncopies == 1 && !slp_node);
10553 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10554 gimple_seq tem = NULL;
10555 gimple_stmt_iterator gsi = gsi_last (tem);
10556 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10557 &LOOP_VINFO_MASKS (loop_vinfo),
10558 1, vectype, 0);
10559 gimple_seq_add_seq (&stmts, tem);
10560 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10561 mask, vec_lhs_phi);
10563 /* Convert the extracted vector element to the scalar type. */
10564 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10566 else
10568 tree bftype = TREE_TYPE (vectype);
10569 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10570 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10571 new_tree = build3 (BIT_FIELD_REF, bftype,
10572 vec_lhs_phi, bitsize, bitstart);
10573 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10574 &stmts, true, NULL_TREE);
10577 if (stmts)
10579 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10580 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10582 /* Remove existing phi from lhs and create one copy from new_tree. */
10583 tree lhs_phi = NULL_TREE;
10584 gimple_stmt_iterator gsi;
10585 for (gsi = gsi_start_phis (exit_bb);
10586 !gsi_end_p (gsi); gsi_next (&gsi))
10588 gimple *phi = gsi_stmt (gsi);
10589 if ((gimple_phi_arg_def (phi, 0) == lhs))
10591 remove_phi_node (&gsi, false);
10592 lhs_phi = gimple_phi_result (phi);
10593 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10594 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10595 break;
10600 /* Replace use of lhs with newly computed result. If the use stmt is a
10601 single arg PHI, just replace all uses of PHI result. It's necessary
10602 because lcssa PHI defining lhs may be before newly inserted stmt. */
10603 use_operand_p use_p;
10604 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10605 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10606 && !is_gimple_debug (use_stmt))
10608 if (gimple_code (use_stmt) == GIMPLE_PHI
10609 && gimple_phi_num_args (use_stmt) == 1)
10611 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10613 else
10615 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10616 SET_USE (use_p, new_tree);
10618 update_stmt (use_stmt);
10621 else
10623 /* For basic-block vectorization simply insert the lane-extraction. */
10624 tree bftype = TREE_TYPE (vectype);
10625 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10626 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10627 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10628 vec_lhs, bitsize, bitstart);
10629 gimple_seq stmts = NULL;
10630 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10631 &stmts, true, NULL_TREE);
10632 if (TREE_CODE (new_tree) == SSA_NAME
10633 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10634 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10635 if (is_a <gphi *> (vec_stmt))
10637 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10638 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10640 else
10642 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10643 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10646 /* Replace use of lhs with newly computed result. If the use stmt is a
10647 single arg PHI, just replace all uses of PHI result. It's necessary
10648 because lcssa PHI defining lhs may be before newly inserted stmt. */
10649 use_operand_p use_p;
10650 stmt_vec_info use_stmt_info;
10651 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10652 if (!is_gimple_debug (use_stmt)
10653 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10654 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10656 /* ??? This can happen when the live lane ends up being
10657 used in a vector construction code-generated by an
10658 external SLP node (and code-generation for that already
10659 happened). See gcc.dg/vect/bb-slp-47.c.
10660 Doing this is what would happen if that vector CTOR
10661 were not code-generated yet so it is not too bad.
10662 ??? In fact we'd likely want to avoid this situation
10663 in the first place. */
10664 if (TREE_CODE (new_tree) == SSA_NAME
10665 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10666 && gimple_code (use_stmt) != GIMPLE_PHI
10667 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10668 use_stmt))
10670 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10671 gcc_checking_assert (code == SSA_NAME
10672 || code == CONSTRUCTOR
10673 || code == VIEW_CONVERT_EXPR
10674 || CONVERT_EXPR_CODE_P (code));
10675 if (dump_enabled_p ())
10676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10677 "Using original scalar computation for "
10678 "live lane because use preceeds vector "
10679 "def\n");
10680 continue;
10682 /* ??? It can also happen that we end up pulling a def into
10683 a loop where replacing out-of-loop uses would require
10684 a new LC SSA PHI node. Retain the original scalar in
10685 those cases as well. PR98064. */
10686 if (TREE_CODE (new_tree) == SSA_NAME
10687 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10688 && (gimple_bb (use_stmt)->loop_father
10689 != gimple_bb (vec_stmt)->loop_father)
10690 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10691 gimple_bb (use_stmt)->loop_father))
10693 if (dump_enabled_p ())
10694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10695 "Using original scalar computation for "
10696 "live lane because there is an out-of-loop "
10697 "definition for it\n");
10698 continue;
10700 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10701 SET_USE (use_p, new_tree);
10702 update_stmt (use_stmt);
10706 return true;
10709 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10711 static void
10712 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10714 ssa_op_iter op_iter;
10715 imm_use_iterator imm_iter;
10716 def_operand_p def_p;
10717 gimple *ustmt;
10719 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10721 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10723 basic_block bb;
10725 if (!is_gimple_debug (ustmt))
10726 continue;
10728 bb = gimple_bb (ustmt);
10730 if (!flow_bb_inside_loop_p (loop, bb))
10732 if (gimple_debug_bind_p (ustmt))
10734 if (dump_enabled_p ())
10735 dump_printf_loc (MSG_NOTE, vect_location,
10736 "killing debug use\n");
10738 gimple_debug_bind_reset_value (ustmt);
10739 update_stmt (ustmt);
10741 else
10742 gcc_unreachable ();
10748 /* Given loop represented by LOOP_VINFO, return true if computation of
10749 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10750 otherwise. */
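   For example (with illustrative numbers): if the niters type is a 32-bit
   unsigned type and NITERSM1 is 0xffffffff, NITERS wraps around to 0, the
   checks below fail and we return false; with NITERSM1 equal to 99 the
   constant comparison 99 < 100 holds and we return true.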
10752 static bool
10753 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10755 /* Constant case. */
10756 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10758 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10759 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10761 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10762 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10763 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10764 return true;
10767 widest_int max;
10768 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10769 /* Check the upper bound of loop niters. */
10770 if (get_max_loop_iterations (loop, &max))
10772 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10773 signop sgn = TYPE_SIGN (type);
10774 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10775 if (max < type_max)
10776 return true;
10778 return false;
10781 /* Return a mask type with half the number of elements as OLD_TYPE,
10782 given that it should have mode NEW_MODE. */
10784 tree
10785 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10787 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10788 return build_truth_vector_type_for_mode (nunits, new_mode);
10791 /* Return a mask type with twice as many elements as OLD_TYPE,
10792 given that it should have mode NEW_MODE. */
10794 tree
10795 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10797 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10798 return build_truth_vector_type_for_mode (nunits, new_mode);
10801 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10802 contain a sequence of NVECTORS masks that each control a vector of type
10803 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10804 these vector masks with the vector version of SCALAR_MASK. */
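   A plausible example (assumed, not tied to any testcase): for a store that
   is executed only when a[i] < 0, SCALAR_MASK would be the tree for
   "a[i] < 0", and the fully-masked loop would AND each loop mask with the
   vector form of that comparison before using it to control the store.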
10806 void
10807 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10808 unsigned int nvectors, tree vectype, tree scalar_mask)
10810 gcc_assert (nvectors != 0);
10812 if (scalar_mask)
10814 scalar_cond_masked_key cond (scalar_mask, nvectors);
10815 loop_vinfo->scalar_cond_masked_set.add (cond);
10818 masks->mask_set.add (std::make_pair (vectype, nvectors));
10821 /* Given a complete set of masks MASKS, extract mask number INDEX
10822 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10823 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10825 See the comment above vec_loop_masks for more details about the mask
10826 arrangement. */
10828 tree
10829 vect_get_loop_mask (loop_vec_info loop_vinfo,
10830 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10831 unsigned int nvectors, tree vectype, unsigned int index)
10833 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10834 == vect_partial_vectors_while_ult)
10836 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10837 tree mask_type = rgm->type;
10839 /* Populate the rgroup's mask array, if this is the first time we've
10840 used it. */
10841 if (rgm->controls.is_empty ())
10843 rgm->controls.safe_grow_cleared (nvectors, true);
10844 for (unsigned int i = 0; i < nvectors; ++i)
10846 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10847 /* Provide a dummy definition until the real one is available. */
10848 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10849 rgm->controls[i] = mask;
10853 tree mask = rgm->controls[index];
10854 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10855 TYPE_VECTOR_SUBPARTS (vectype)))
10857 /* A loop mask for data type X can be reused for data type Y
10858 if X has N times more elements than Y and if Y's elements
10859 are N times bigger than X's. In this case each sequence
10860 of N elements in the loop mask will be all-zero or all-one.
10861 We can then view-convert the mask so that each sequence of
10862 N elements is replaced by a single element. */
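	  /* For instance (assuming both vector modes are available): a mask
	     created for 8 x HImode data can control 4 x SImode data, since
	     each SImode element spans two HImode elements; every adjacent
	     pair of mask elements is then all-zero or all-one, and the
	     VIEW_CONVERT_EXPR below folds each pair into a single element
	     of the wider mask type.  */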
10863 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10864 TYPE_VECTOR_SUBPARTS (vectype)));
10865 gimple_seq seq = NULL;
10866 mask_type = truth_type_for (vectype);
10867 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10868 if (seq)
10869 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10871 return mask;
10873 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10874 == vect_partial_vectors_avx512)
10876 /* The number of scalars per iteration and the number of vectors are
10877 both compile-time constants. */
10878 unsigned int nscalars_per_iter
10879 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10880 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10882 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10884 /* The stored nV is dependent on the mask type produced. */
10885 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10886 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10887 == rgm->factor);
10888 nvectors = rgm->factor;
10890 /* Populate the rgroup's mask array, if this is the first time we've
10891 used it. */
10892 if (rgm->controls.is_empty ())
10894 rgm->controls.safe_grow_cleared (nvectors, true);
10895 for (unsigned int i = 0; i < nvectors; ++i)
10897 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10898 /* Provide a dummy definition until the real one is available. */
10899 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10900 rgm->controls[i] = mask;
10903 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10904 TYPE_VECTOR_SUBPARTS (vectype)))
10905 return rgm->controls[index];
10907 /* Split the vector if needed. Since we are dealing with integer mode
10908 masks with AVX512 we can operate on the integer representation
10909 performing the whole vector shifting. */
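      /* An illustrative example (hypothetical sizes): if the stored control
	 RGM->TYPE has 16 elements and VECTYPE has 4, FACTOR is 4, so mask
	 number INDEX == 6 lives in control vector VI == 1 at sub-part
	 VPART == 2; the integer representation of that control is shifted
	 right by 4 * 2 == 8 bits below and its low 4 bits form the mask.  */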
10910 unsigned HOST_WIDE_INT factor;
10911 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10912 TYPE_VECTOR_SUBPARTS (vectype), &factor);
10913 gcc_assert (ok);
10914 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10915 tree mask_type = truth_type_for (vectype);
10916 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10917 unsigned vi = index / factor;
10918 unsigned vpart = index % factor;
10919 tree vec = rgm->controls[vi];
10920 gimple_seq seq = NULL;
10921 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10922 lang_hooks.types.type_for_mode
10923 (TYPE_MODE (rgm->type), 1), vec);
10924 /* For integer mode masks simply shift the right bits into position. */
10925 if (vpart != 0)
10926 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10927 build_int_cst (integer_type_node,
10928 (TYPE_VECTOR_SUBPARTS (vectype)
10929 * vpart)));
10930 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10931 (TYPE_MODE (mask_type), 1), vec);
10932 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10933 if (seq)
10934 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10935 return vec;
10937 else
10938 gcc_unreachable ();
10941 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10942 lengths for controlling an operation on VECTYPE. The operation splits
10943 each element of VECTYPE into FACTOR separate subelements, measuring the
10944 length as a number of these subelements. */
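   For example (an assumed scenario): if a V4SI access has to be implemented
   as a VnQI load or store, FACTOR is 4 and the recorded length counts bytes
   rather than SImode elements.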
10946 void
10947 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10948 unsigned int nvectors, tree vectype, unsigned int factor)
10950 gcc_assert (nvectors != 0);
10951 if (lens->length () < nvectors)
10952 lens->safe_grow_cleared (nvectors, true);
10953 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10955   /* The number of scalars per iteration, the scalar occupied bytes and
10956      the number of vectors are all compile-time constants.  */
10957 unsigned int nscalars_per_iter
10958 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10959 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10961 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10963 /* For now, we only support cases in which all loads and stores fall back
10964 to VnQI or none do. */
10965 gcc_assert (!rgl->max_nscalars_per_iter
10966 || (rgl->factor == 1 && factor == 1)
10967 || (rgl->max_nscalars_per_iter * rgl->factor
10968 == nscalars_per_iter * factor));
10969 rgl->max_nscalars_per_iter = nscalars_per_iter;
10970 rgl->type = vectype;
10971 rgl->factor = factor;
10975 /* Given a complete set of lengths LENS, extract length number INDEX
10976 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10977 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10978    multiplied by the number of elements that should be processed.
10979 Insert any set-up statements before GSI. */
10981 tree
10982 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10983 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10984 unsigned int index, unsigned int factor)
10986 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10987 bool use_bias_adjusted_len =
10988 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10990 /* Populate the rgroup's len array, if this is the first time we've
10991 used it. */
10992 if (rgl->controls.is_empty ())
10994 rgl->controls.safe_grow_cleared (nvectors, true);
10995 for (unsigned int i = 0; i < nvectors; ++i)
10997 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10998 gcc_assert (len_type != NULL_TREE);
11000 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11002 /* Provide a dummy definition until the real one is available. */
11003 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11004 rgl->controls[i] = len;
11006 if (use_bias_adjusted_len)
11008 gcc_assert (i == 0);
11009 tree adjusted_len =
11010 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11011 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11012 rgl->bias_adjusted_ctrl = adjusted_len;
11017 if (use_bias_adjusted_len)
11018 return rgl->bias_adjusted_ctrl;
11020 tree loop_len = rgl->controls[index];
11021 if (rgl->factor == 1 && factor == 1)
11023 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11024 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11025 if (maybe_ne (nunits1, nunits2))
11027 /* A loop len for data type X can be reused for data type Y
11028 if X has N times more elements than Y and if Y's elements
11029 are N times bigger than X's. */
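	  /* Illustrative example (assumed types): if the stored control was
	     created for a 16-element vector and VECTYPE has 4 elements, the
	     length counted in the narrower elements is divided by 4 below to
	     give the number of VECTYPE elements still to be processed.  */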
11030 gcc_assert (multiple_p (nunits1, nunits2));
11031 factor = exact_div (nunits1, nunits2).to_constant ();
11032 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11033 gimple_seq seq = NULL;
11034 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11035 build_int_cst (iv_type, factor));
11036 if (seq)
11037 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11040 return loop_len;
11043 /* Scale profiling counters by estimation for LOOP which is vectorized
11044 by factor VF.
11045 If FLAT is true, the loop we started with had unrealistically flat
11046 profile. */
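   As a rough illustration (made-up counts): with VF == 4, a loop whose
   header count is 1000 and whose preheader (entry) count is 100 should,
   after vectorization, have a header count of about 250, and the exit edge
   probability is scaled up accordingly (from about 10% to about 40%) so
   that the entry count is preserved.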
11048 static void
11049 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11051   /* For flat profiles do not scale down proportionally by VF; only
11052      cap by the known iteration count bounds.  */
11053 if (flat)
11055 if (dump_file && (dump_flags & TDF_DETAILS))
11056 fprintf (dump_file,
11057 "Vectorized loop profile seems flat; not scaling iteration "
11058 "count down by the vectorization factor %i\n", vf);
11059 scale_loop_profile (loop, profile_probability::always (),
11060 get_likely_max_loop_iterations_int (loop));
11061 return;
11063 /* Loop body executes VF fewer times and exit increases VF times. */
11064 profile_count entry_count = loop_preheader_edge (loop)->count ();
11066   /* If we have an unreliable loop profile, avoid dropping the entry
11067      count below the header count.  This can happen since such loops
11068      have unrealistically low trip counts.  */
11069 while (vf > 1
11070 && loop->header->count > entry_count
11071 && loop->header->count < entry_count * vf)
11073 if (dump_file && (dump_flags & TDF_DETAILS))
11074 fprintf (dump_file,
11075 "Vectorization factor %i seems too large for profile "
11076 "prevoiusly believed to be consistent; reducing.\n", vf);
11077 vf /= 2;
11080 if (entry_count.nonzero_p ())
11081 set_edge_probability_and_rescale_others
11082 (exit_e,
11083 entry_count.probability_in (loop->header->count / vf));
11084   /* Avoid producing a very large exit probability when we do not have
11085      a sensible profile.  */
11086 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11087 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11088 loop->latch->count = single_pred_edge (loop->latch)->count ();
11090 scale_loop_profile (loop, profile_probability::always () / vf,
11091 get_likely_max_loop_iterations_int (loop));
11094 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11095 latch edge values originally defined by it. */
11097 static void
11098 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11099 stmt_vec_info def_stmt_info)
11101 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11102 if (!def || TREE_CODE (def) != SSA_NAME)
11103 return;
11104 stmt_vec_info phi_info;
11105 imm_use_iterator iter;
11106 use_operand_p use_p;
11107 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11109 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11110 if (!phi)
11111 continue;
11112 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11113 && (phi_info = loop_vinfo->lookup_stmt (phi))
11114 && STMT_VINFO_RELEVANT_P (phi_info)))
11115 continue;
11116 loop_p loop = gimple_bb (phi)->loop_father;
11117 edge e = loop_latch_edge (loop);
11118 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11119 continue;
11121 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11122 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11123 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11125 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11126 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11127 gcc_assert (phi_defs.length () == latch_defs.length ());
11128 for (unsigned i = 0; i < phi_defs.length (); ++i)
11129 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11130 gimple_get_lhs (latch_defs[i]), e,
11131 gimple_phi_arg_location (phi, e->dest_idx));
11133 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11135 /* For first order recurrences we have to update both uses of
11136 the latch definition, the one in the PHI node and the one
11137 in the generated VEC_PERM_EXPR. */
11138 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11139 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11140 gcc_assert (phi_defs.length () == latch_defs.length ());
11141 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11142 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11143 for (unsigned i = 0; i < phi_defs.length (); ++i)
11145 gassign *perm = as_a <gassign *> (phi_defs[i]);
11146 if (i > 0)
11147 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11148 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11149 update_stmt (perm);
11151 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11152 gimple_phi_arg_location (phi, e->dest_idx));
11157 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11158 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11159 stmt_vec_info. */
11161 static bool
11162 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11163 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11165 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11166 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11168 if (dump_enabled_p ())
11169 dump_printf_loc (MSG_NOTE, vect_location,
11170 "------>vectorizing statement: %G", stmt_info->stmt);
11172 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11173 vect_loop_kill_debug_uses (loop, stmt_info);
11175 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11176 && !STMT_VINFO_LIVE_P (stmt_info))
11177 return false;
11179 if (STMT_VINFO_VECTYPE (stmt_info))
11181 poly_uint64 nunits
11182 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11183 if (!STMT_SLP_TYPE (stmt_info)
11184 && maybe_ne (nunits, vf)
11185 && dump_enabled_p ())
11186 	/* For SLP, VF is set according to the unrolling factor, and not
11187 	   to the vector size, hence for SLP this print is not valid.  */
11188 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11191 /* Pure SLP statements have already been vectorized. We still need
11192 to apply loop vectorization to hybrid SLP statements. */
11193 if (PURE_SLP_STMT (stmt_info))
11194 return false;
11196 if (dump_enabled_p ())
11197 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11199 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11200 *seen_store = stmt_info;
11202 return true;
11205 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11206 in the hash_map with its corresponding values. */
11208 static tree
11209 find_in_mapping (tree t, void *context)
11211 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11213 tree *value = mapping->get (t);
11214 return value ? *value : t;
11217 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11218 original loop that has now been vectorized.
11220 The inits of the data_references need to be advanced with the number of
11221 iterations of the main loop. This has been computed in vect_do_peeling and
11222 is stored in parameter ADVANCE. We first restore the data_references
11223    initial offset with the values recorded in ORIG_DRS_INIT.
11225 Since the loop_vec_info of this EPILOGUE was constructed for the original
11226 loop, its stmt_vec_infos all point to the original statements. These need
11227 to be updated to point to their corresponding copies as well as the SSA_NAMES
11228 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11230 The data_reference's connections also need to be updated. Their
11231 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11232 stmt_vec_infos, their statements need to point to their corresponding copy,
11233 if they are gather loads or scatter stores then their reference needs to be
11234 updated to point to its corresponding copy and finally we set
11235 'base_misaligned' to false as we have already peeled for alignment in the
11236 prologue of the main loop. */
11238 static void
11239 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11241 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11242 auto_vec<gimple *> stmt_worklist;
11243 hash_map<tree,tree> mapping;
11244 gimple *orig_stmt, *new_stmt;
11245 gimple_stmt_iterator epilogue_gsi;
11246 gphi_iterator epilogue_phi_gsi;
11247 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11248 basic_block *epilogue_bbs = get_loop_body (epilogue);
11249 unsigned i;
11251 free (LOOP_VINFO_BBS (epilogue_vinfo));
11252 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11254 /* Advance data_reference's with the number of iterations of the previous
11255 loop and its prologue. */
11256 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11259 /* The EPILOGUE loop is a copy of the original loop so they share the same
11260 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11261 point to the copied statements. We also create a mapping of all LHS' in
11262 the original loop and all the LHS' in the EPILOGUE and create worklists to
11263      update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
11264 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11266 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11267 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11269 new_stmt = epilogue_phi_gsi.phi ();
11271 gcc_assert (gimple_uid (new_stmt) > 0);
11272 stmt_vinfo
11273 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11275 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11276 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11278 mapping.put (gimple_phi_result (orig_stmt),
11279 gimple_phi_result (new_stmt));
11280 	  /* PHI nodes cannot have patterns or related statements.  */
11281 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11282 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11285 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11286 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11288 new_stmt = gsi_stmt (epilogue_gsi);
11289 if (is_gimple_debug (new_stmt))
11290 continue;
11292 gcc_assert (gimple_uid (new_stmt) > 0);
11293 stmt_vinfo
11294 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11296 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11297 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11299 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11300 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11302 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11304 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11305 for (gimple_stmt_iterator gsi = gsi_start (seq);
11306 !gsi_end_p (gsi); gsi_next (&gsi))
11307 stmt_worklist.safe_push (gsi_stmt (gsi));
11310 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11311 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11313 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11314 stmt_worklist.safe_push (stmt);
11315 /* Set BB such that the assert in
11316 'get_initial_def_for_reduction' is able to determine that
11317 the BB of the related stmt is inside this loop. */
11318 gimple_set_bb (stmt,
11319 gimple_bb (new_stmt));
11320 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11321 gcc_assert (related_vinfo == NULL
11322 || related_vinfo == stmt_vinfo);
11327 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11328 using the original main loop and thus need to be updated to refer to the
11329 cloned variables used in the epilogue. */
11330 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11332 gimple *stmt = stmt_worklist[i];
11333 tree *new_op;
11335 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11337 tree op = gimple_op (stmt, j);
11338 if ((new_op = mapping.get(op)))
11339 gimple_set_op (stmt, j, *new_op);
11340 else
11342 /* PR92429: The last argument of simplify_replace_tree disables
11343 folding when replacing arguments. This is required as
11344 otherwise you might end up with different statements than the
11345 ones analyzed in vect_loop_analyze, leading to different
11346 vectorization. */
11347 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11348 &find_in_mapping, &mapping, false);
11349 gimple_set_op (stmt, j, op);
11354 struct data_reference *dr;
11355 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11356 FOR_EACH_VEC_ELT (datarefs, i, dr)
11358 orig_stmt = DR_STMT (dr);
11359 gcc_assert (gimple_uid (orig_stmt) > 0);
11360 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11361 /* Data references for gather loads and scatter stores do not use the
11362 updated offset we set using ADVANCE. Instead we have to make sure the
11363 reference in the data references point to the corresponding copy of
11364 the original in the epilogue. Make sure to update both
11365 gather/scatters recognized by dataref analysis and also other
11366 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11367 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11368 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11369 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11371 DR_REF (dr)
11372 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11373 &find_in_mapping, &mapping);
11374 DR_BASE_ADDRESS (dr)
11375 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11376 &find_in_mapping, &mapping);
11378 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11379 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11380 /* The vector size of the epilogue is smaller than that of the main loop
11381 	 so the alignment is either the same or lower.  This means the dr
11382 	 will by definition be aligned.  */
11383 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11386 epilogue_vinfo->shared->datarefs_copy.release ();
11387 epilogue_vinfo->shared->save_datarefs ();
11390 /* Function vect_transform_loop.
11392 The analysis phase has determined that the loop is vectorizable.
11393 Vectorize the loop - created vectorized stmts to replace the scalar
11394 stmts in the loop, and update the loop exit condition.
11395 Returns scalar epilogue loop if any. */
11397 class loop *
11398 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11400 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11401 class loop *epilogue = NULL;
11402 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11403 int nbbs = loop->num_nodes;
11404 int i;
11405 tree niters_vector = NULL_TREE;
11406 tree step_vector = NULL_TREE;
11407 tree niters_vector_mult_vf = NULL_TREE;
11408 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11409 unsigned int lowest_vf = constant_lower_bound (vf);
11410 gimple *stmt;
11411 bool check_profitability = false;
11412 unsigned int th;
11413 bool flat = maybe_flat_loop_profile (loop);
11415 DUMP_VECT_SCOPE ("vec_transform_loop");
11417 loop_vinfo->shared->check_datarefs ();
11419 /* Use the more conservative vectorization threshold. If the number
11420 of iterations is constant assume the cost check has been performed
11421 by our caller. If the threshold makes all loops profitable that
11422 run at least the (estimated) vectorization factor number of times
11423 checking is pointless, too. */
11424 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11425 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11427 if (dump_enabled_p ())
11428 dump_printf_loc (MSG_NOTE, vect_location,
11429 "Profitability threshold is %d loop iterations.\n",
11430 th);
11431 check_profitability = true;
11434 /* Make sure there exists a single-predecessor exit bb. Do this before
11435 versioning. */
11436 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11437 if (! single_pred_p (e->dest))
11439 split_loop_exit_edge (e, true);
11440 if (dump_enabled_p ())
11441 dump_printf (MSG_NOTE, "split exit edge\n");
11444 /* Version the loop first, if required, so the profitability check
11445 comes first. */
11447 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11449 class loop *sloop
11450 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11451 sloop->force_vectorize = false;
11452 check_profitability = false;
11455 /* Make sure there exists a single-predecessor exit bb also on the
11456 scalar loop copy. Do this after versioning but before peeling
11457 so CFG structure is fine for both scalar and if-converted loop
11458 to make slpeel_duplicate_current_defs_from_edges face matched
11459 loop closed PHI nodes on the exit. */
11460 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11462 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11463 if (! single_pred_p (e->dest))
11465 split_loop_exit_edge (e, true);
11466 if (dump_enabled_p ())
11467 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11471 tree niters = vect_build_loop_niters (loop_vinfo);
11472 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11473 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11474 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11475 tree advance;
11476 drs_init_vec orig_drs_init;
11478 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11479 &step_vector, &niters_vector_mult_vf, th,
11480 check_profitability, niters_no_overflow,
11481 &advance);
11482 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11483 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11485       /* Ifcvt duplicates loop preheader, loop body and produces a basic
11486 block after loop exit. We need to scale all that. */
11487 basic_block preheader
11488 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11489 preheader->count
11490 = preheader->count.apply_probability
11491 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11492 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11493 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11494 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11495 = preheader->count;
11498 if (niters_vector == NULL_TREE)
11500 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11501 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11502 && known_eq (lowest_vf, vf))
11504 niters_vector
11505 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11506 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11507 step_vector = build_one_cst (TREE_TYPE (niters));
11509 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11510 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11511 &step_vector, niters_no_overflow);
11512 else
11513 /* vect_do_peeling subtracted the number of peeled prologue
11514 iterations from LOOP_VINFO_NITERS. */
11515 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11516 &niters_vector, &step_vector,
11517 niters_no_overflow);
11520 /* 1) Make sure the loop header has exactly two entries
11521 2) Make sure we have a preheader basic block. */
11523 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11525 split_edge (loop_preheader_edge (loop));
11527 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11528 /* This will deal with any possible peeling. */
11529 vect_prepare_for_masked_peels (loop_vinfo);
11531 /* Schedule the SLP instances first, then handle loop vectorization
11532 below. */
11533 if (!loop_vinfo->slp_instances.is_empty ())
11535 DUMP_VECT_SCOPE ("scheduling SLP instances");
11536 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11539   /* FORNOW: the vectorizer supports only loops whose body consists
11540      of one basic block (header + empty latch).  When the vectorizer
11541      supports more involved loop forms, the order in which the BBs are
11542      traversed needs to be reconsidered.  */
11544 for (i = 0; i < nbbs; i++)
11546 basic_block bb = bbs[i];
11547 stmt_vec_info stmt_info;
11549 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11550 gsi_next (&si))
11552 gphi *phi = si.phi ();
11553 if (dump_enabled_p ())
11554 dump_printf_loc (MSG_NOTE, vect_location,
11555 "------>vectorizing phi: %G", (gimple *) phi);
11556 stmt_info = loop_vinfo->lookup_stmt (phi);
11557 if (!stmt_info)
11558 continue;
11560 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11561 vect_loop_kill_debug_uses (loop, stmt_info);
11563 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11564 && !STMT_VINFO_LIVE_P (stmt_info))
11565 continue;
11567 if (STMT_VINFO_VECTYPE (stmt_info)
11568 && (maybe_ne
11569 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11570 && dump_enabled_p ())
11571 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11573 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11574 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11575 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11576 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11577 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11578 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11579 && ! PURE_SLP_STMT (stmt_info))
11581 if (dump_enabled_p ())
11582 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11583 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11587 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11588 gsi_next (&si))
11590 gphi *phi = si.phi ();
11591 stmt_info = loop_vinfo->lookup_stmt (phi);
11592 if (!stmt_info)
11593 continue;
11595 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11596 && !STMT_VINFO_LIVE_P (stmt_info))
11597 continue;
11599 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11600 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11601 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11602 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11603 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11604 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11605 && ! PURE_SLP_STMT (stmt_info))
11606 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11609 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11610 !gsi_end_p (si);)
11612 stmt = gsi_stmt (si);
11613 /* During vectorization remove existing clobber stmts. */
11614 if (gimple_clobber_p (stmt))
11616 unlink_stmt_vdef (stmt);
11617 gsi_remove (&si, true);
11618 release_defs (stmt);
11620 else
11622 /* Ignore vector stmts created in the outer loop. */
11623 stmt_info = loop_vinfo->lookup_stmt (stmt);
11625 /* vector stmts created in the outer-loop during vectorization of
11626 stmts in an inner-loop may not have a stmt_info, and do not
11627 need to be vectorized. */
11628 stmt_vec_info seen_store = NULL;
11629 if (stmt_info)
11631 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11633 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11634 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11635 !gsi_end_p (subsi); gsi_next (&subsi))
11637 stmt_vec_info pat_stmt_info
11638 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11639 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11640 &si, &seen_store);
11642 stmt_vec_info pat_stmt_info
11643 = STMT_VINFO_RELATED_STMT (stmt_info);
11644 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11645 &si, &seen_store))
11646 maybe_set_vectorized_backedge_value (loop_vinfo,
11647 pat_stmt_info);
11649 else
11651 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11652 &seen_store))
11653 maybe_set_vectorized_backedge_value (loop_vinfo,
11654 stmt_info);
11657 gsi_next (&si);
11658 if (seen_store)
11660 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11661 /* Interleaving. Once a grouped store has been seen, the
11662 vectorization of the whole interleaving chain has been
11663 completed - free all the stores in the chain. */
11664 vect_remove_stores (loop_vinfo,
11665 DR_GROUP_FIRST_ELEMENT (seen_store));
11666 else
11667 /* Free the attached stmt_vec_info and remove the stmt. */
11668 loop_vinfo->remove_stmt (stmt_info);
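/* Illustration of the pattern case above (made-up SSA names): for a
   widening multiply such as
     a_T = (int) a_5;  b_T = (int) b_6;  prod_7 = a_T * b_T;
   the pattern recognizer replaces prod_7's statement with a pattern
   statement like prod_7 = a_5 w* b_6, possibly together with a pattern
   def sequence of auxiliary statements; the code above vectorizes the
   def sequence first and then the main pattern statement, reaching both
   through the stmt_info of the original scalar statement.  */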
11673 /* Stub out scalar statements that must not survive vectorization.
11674 Doing this here helps with grouped statements, or statements that
11675 are involved in patterns. */
11676 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11677 !gsi_end_p (gsi); gsi_next (&gsi))
11679 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11680 if (!call || !gimple_call_internal_p (call))
11681 continue;
11682 internal_fn ifn = gimple_call_internal_fn (call);
11683 if (ifn == IFN_MASK_LOAD)
11685 tree lhs = gimple_get_lhs (call);
11686 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11688 tree zero = build_zero_cst (TREE_TYPE (lhs));
11689 gimple *new_stmt = gimple_build_assign (lhs, zero);
11690 gsi_replace (&gsi, new_stmt, true);
11693 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11695 tree lhs = gimple_get_lhs (call);
11696 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11698 tree else_arg
11699 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11700 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11701 gsi_replace (&gsi, new_stmt, true);
11705 } /* BBs in loop */
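/* Illustration of the stubbing above (made-up SSA names): a scalar
     _5 = .MASK_LOAD (ptr_3, 0B, mask_7);
   whose lhs did not get a vector type is replaced by _5 = 0, and a scalar
   conditional call such as
     _6 = .COND_ADD (mask_7, _1, _2, _1);
   is replaced by an assignment of its "else" (last) argument, _6 = _1.  */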
11707 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11708 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11709 if (integer_onep (step_vector))
11710 niters_no_overflow = true;
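/* For illustration with made-up numbers: with 32-bit counters a scalar
   iteration count of 2^32 is represented as a zero NITERS, but with VF = 4
   and step_vector = 1 the vector IV only has to count up to
   NITERS_VECTOR = 2^30, which is nonzero and cannot wrap.  */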
11711 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11712 niters_vector, step_vector, niters_vector_mult_vf,
11713 !niters_no_overflow);
11715 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11717 /* True if the final iteration might not handle a full vector's
11718 worth of scalar iterations. */
11719 bool final_iter_may_be_partial
11720 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11721 /* The minimum number of iterations performed by the epilogue. This
11722 is 1 when peeling for gaps because we always need a final scalar
11723 iteration. */
11724 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11725 /* +1 to convert latch counts to loop iteration counts,
11726 -min_epilogue_iters to remove iterations that cannot be performed
11727 by the vector code. */
11728 int bias_for_lowest = 1 - min_epilogue_iters;
11729 int bias_for_assumed = bias_for_lowest;
11730 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11731 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11733 /* When the amount of peeling is known at compile time, the first
11734 iteration will have exactly alignment_npeels active elements.
11735 In the worst case it will have at least one. */
11736 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11737 bias_for_lowest += lowest_vf - min_first_active;
11738 bias_for_assumed += assumed_vf - min_first_active;
11740 /* In these calculations the "- 1" converts loop iteration counts
11741 back to latch counts. */
11742 if (loop->any_upper_bound)
11744 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11745 loop->nb_iterations_upper_bound
11746 = (final_iter_may_be_partial
11747 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11748 lowest_vf) - 1
11749 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11750 lowest_vf) - 1);
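/* Worked example with made-up numbers: for a scalar latch-count upper
   bound of 102 (at most 103 iterations), lowest_vf = 4, no peeling for
   gaps (so bias_for_lowest = 1) and no partial final iteration, the
   vector loop runs at most (102 + 1) / 4 = 25 times, i.e. its latch
   executes at most 24 times; with partial vectors the ceiling division
   counts a partial final vector iteration as well.  */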
11751 if (main_vinfo
11752 /* Both peeling for alignment and peeling for gaps can end up
11753 with the scalar epilogue running for more than VF-1 iterations. */
11754 && !main_vinfo->peeling_for_alignment
11755 && !main_vinfo->peeling_for_gaps)
11757 unsigned int bound;
11758 poly_uint64 main_iters
11759 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11760 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11761 main_iters
11762 = upper_bound (main_iters,
11763 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11764 if (can_div_away_from_zero_p (main_iters,
11765 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11766 &bound))
11767 loop->nb_iterations_upper_bound
11768 = wi::umin ((bound_wide_int) (bound - 1),
11769 loop->nb_iterations_upper_bound);
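/* Rough illustration with made-up numbers: for the epilogue of a main
   loop with VF = 16 and no cost-model or versioning threshold,
   main_iters = 16; dividing by the epilogue VF of 8 away from zero gives
   bound = 2, so the epilogue latch is known to run at most once.  */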
11772 if (loop->any_likely_upper_bound)
11773 loop->nb_iterations_likely_upper_bound
11774 = (final_iter_may_be_partial
11775 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11776 + bias_for_lowest, lowest_vf) - 1
11777 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11778 + bias_for_lowest, lowest_vf) - 1);
11779 if (loop->any_estimate)
11780 loop->nb_iterations_estimate
11781 = (final_iter_may_be_partial
11782 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11783 assumed_vf) - 1
11784 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11785 assumed_vf) - 1);
11786 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11787 assumed_vf, flat);
11789 if (dump_enabled_p ())
11791 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11793 dump_printf_loc (MSG_NOTE, vect_location,
11794 "LOOP VECTORIZED\n");
11795 if (loop->inner)
11796 dump_printf_loc (MSG_NOTE, vect_location,
11797 "OUTER LOOP VECTORIZED\n");
11798 dump_printf (MSG_NOTE, "\n");
11800 else
11801 dump_printf_loc (MSG_NOTE, vect_location,
11802 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11803 GET_MODE_NAME (loop_vinfo->vector_mode));
11806 /* Loops vectorized with a variable factor won't benefit from
11807 unrolling/peeling. */
11808 if (!vf.is_constant ())
11810 loop->unroll = 1;
11811 if (dump_enabled_p ())
11812 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11813 " variable-length vectorization factor\n");
11815 /* Free SLP instances here because otherwise stmt reference counting
11816 won't work. */
11817 slp_instance instance;
11818 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11819 vect_free_slp_instance (instance);
11820 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11821 /* Clear the safelen field since its value is invalid after vectorization:
11822 the vectorized loop can have loop-carried dependencies. */
11823 loop->safelen = 0;
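/* For instance, a loop annotated with "#pragma omp simd safelen(16)"
   enters the vectorizer with loop->safelen == 16; after vectorizing with
   VF = 4 each vector iteration already covers 4 scalar iterations, so the
   old value no longer describes the transformed loop and could wrongly
   enable later transformations.  */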
11825 if (epilogue)
11827 update_epilogue_loop_vinfo (epilogue, advance);
11829 epilogue->simduid = loop->simduid;
11830 epilogue->force_vectorize = loop->force_vectorize;
11831 epilogue->dont_vectorize = false;
11834 return epilogue;
11837 /* The code below performs a simple optimization - it reverts
11838 if-conversion for masked stores: if the mask of a store is zero,
11839 skip the store and, if possible, the producers of the stored values too.
11840 For example,
11841 for (i=0; i<n; i++)
11842 if (c[i])
11844 p1[i] += 1;
11845 p2[i] = p3[i] +2;
11847 this transformation will produce the following semi-hammock:
11849 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
11851 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11852 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11853 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11854 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11855 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11856 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11860 void
11861 optimize_mask_stores (class loop *loop)
11863 basic_block *bbs = get_loop_body (loop);
11864 unsigned nbbs = loop->num_nodes;
11865 unsigned i;
11866 basic_block bb;
11867 class loop *bb_loop;
11868 gimple_stmt_iterator gsi;
11869 gimple *stmt;
11870 auto_vec<gimple *> worklist;
11871 auto_purge_vect_location sentinel;
11873 vect_location = find_loop_location (loop);
11874 /* Pick up all masked stores in the loop, if any. */
11875 for (i = 0; i < nbbs; i++)
11877 bb = bbs[i];
11878 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11879 gsi_next (&gsi))
11881 stmt = gsi_stmt (gsi);
11882 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11883 worklist.safe_push (stmt);
11887 free (bbs);
11888 if (worklist.is_empty ())
11889 return;
11891 /* Loop has masked stores. */
11892 while (!worklist.is_empty ())
11894 gimple *last, *last_store;
11895 edge e, efalse;
11896 tree mask;
11897 basic_block store_bb, join_bb;
11898 gimple_stmt_iterator gsi_to;
11899 tree vdef, new_vdef;
11900 gphi *phi;
11901 tree vectype;
11902 tree zero;
11904 last = worklist.pop ();
11905 mask = gimple_call_arg (last, 2);
11906 bb = gimple_bb (last);
11907 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
11908 to the same loop as if_bb. That loop can differ from LOOP when a
11909 two-level loop nest is vectorized and the mask_store belongs to the
11910 inner loop. */
11911 e = split_block (bb, last);
11912 bb_loop = bb->loop_father;
11913 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11914 join_bb = e->dest;
11915 store_bb = create_empty_bb (bb);
11916 add_bb_to_loop (store_bb, bb_loop);
11917 e->flags = EDGE_TRUE_VALUE;
11918 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11919 /* Put STORE_BB on the likely path. */
11920 efalse->probability = profile_probability::likely ();
11921 e->probability = efalse->probability.invert ();
11922 store_bb->count = efalse->count ();
11923 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11924 if (dom_info_available_p (CDI_DOMINATORS))
11925 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11926 if (dump_enabled_p ())
11927 dump_printf_loc (MSG_NOTE, vect_location,
11928 "Create new block %d to sink mask stores.",
11929 store_bb->index);
11930 /* Create vector comparison with boolean result. */
11931 vectype = TREE_TYPE (mask);
11932 zero = build_zero_cst (vectype);
11933 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11934 gsi = gsi_last_bb (bb);
11935 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
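/* The resulting CFG fragment looks like this (sketch):
     bb:       if (mask == { 0, ... }) goto join_bb; else goto store_bb;
     store_bb: the masked stores (and sunk producers), the likely path;
     join_bb:  the rest of the original block.  */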
11936 /* Create a new PHI node for the vdef of the last masked store:
11937 .MEM_2 = VDEF <.MEM_1>
11938 will be converted to
11939 .MEM_3 = VDEF <.MEM_1>
11940 and a new PHI node will be created in the join bb:
11941 .MEM_2 = PHI <.MEM_1, .MEM_3>
11943 vdef = gimple_vdef (last);
11944 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11945 gimple_set_vdef (last, new_vdef);
11946 phi = create_phi_node (vdef, join_bb);
11947 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11949 /* Put all masked stores with the same mask into STORE_BB if possible. */
11950 while (true)
11952 gimple_stmt_iterator gsi_from;
11953 gimple *stmt1 = NULL;
11955 /* Move masked store to STORE_BB. */
11956 last_store = last;
11957 gsi = gsi_for_stmt (last);
11958 gsi_from = gsi;
11959 /* Shift GSI to the previous stmt for further traversal. */
11960 gsi_prev (&gsi);
11961 gsi_to = gsi_start_bb (store_bb);
11962 gsi_move_before (&gsi_from, &gsi_to);
11963 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
11964 gsi_to = gsi_start_bb (store_bb);
11965 if (dump_enabled_p ())
11966 dump_printf_loc (MSG_NOTE, vect_location,
11967 "Move stmt to created bb\n%G", last);
11968 /* Move all stored value producers if possible. */
11969 while (!gsi_end_p (gsi))
11971 tree lhs;
11972 imm_use_iterator imm_iter;
11973 use_operand_p use_p;
11974 bool res;
11976 /* Skip debug statements. */
11977 if (is_gimple_debug (gsi_stmt (gsi)))
11979 gsi_prev (&gsi);
11980 continue;
11982 stmt1 = gsi_stmt (gsi);
11983 /* Do not consider statements writing to memory or having a
11984 volatile operand. */
11985 if (gimple_vdef (stmt1)
11986 || gimple_has_volatile_ops (stmt1))
11987 break;
11988 gsi_from = gsi;
11989 gsi_prev (&gsi);
11990 lhs = gimple_get_lhs (stmt1);
11991 if (!lhs)
11992 break;
11994 /* LHS of vectorized stmt must be SSA_NAME. */
11995 if (TREE_CODE (lhs) != SSA_NAME)
11996 break;
11998 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12000 /* Remove dead scalar statement. */
12001 if (has_zero_uses (lhs))
12003 gsi_remove (&gsi_from, true);
12004 continue;
12008 /* Check that LHS does not have uses outside of STORE_BB. */
12009 res = true;
12010 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12012 gimple *use_stmt;
12013 use_stmt = USE_STMT (use_p);
12014 if (is_gimple_debug (use_stmt))
12015 continue;
12016 if (gimple_bb (use_stmt) != store_bb)
12018 res = false;
12019 break;
12022 if (!res)
12023 break;
12025 if (gimple_vuse (stmt1)
12026 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12027 break;
12029 /* Can move STMT1 to STORE_BB. */
12030 if (dump_enabled_p ())
12031 dump_printf_loc (MSG_NOTE, vect_location,
12032 "Move stmt to created bb\n%G", stmt1);
12033 gsi_move_before (&gsi_from, &gsi_to);
12034 /* Shift GSI_TO for further insertion. */
12035 gsi_prev (&gsi_to);
12037 /* Put other masked stores with the same mask into STORE_BB. */
12038 if (worklist.is_empty ()
12039 || gimple_call_arg (worklist.last (), 2) != mask
12040 || worklist.last () != stmt1)
12041 break;
12042 last = worklist.pop ();
12044 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
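/* In the semi-hammock from the comment above this sinks not only the
   MASK_STOREs but also the additions and MASK_LOADs feeding them: they
   have no vdef, their vuse matches the store's, and their results are
   used only inside STORE_BB.  */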
12048 /* Decide whether it is possible to use a zero-based induction variable
12049 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12050 the value that the induction variable must be able to hold in order
12051 to ensure that the rgroups eventually have no active vector elements.
12052 Return -1 otherwise. */
12054 widest_int
12055 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12057 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12058 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12059 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12061 /* Calculate the value that the induction variable must be able
12062 to hit in order to ensure that we end the loop with an all-false mask.
12063 This involves adding the maximum number of inactive trailing scalar
12064 iterations. */
12065 widest_int iv_limit = -1;
12066 if (max_loop_iterations (loop, &iv_limit))
12068 if (niters_skip)
12070 /* Add the maximum number of skipped iterations to the
12071 maximum iteration count. */
12072 if (TREE_CODE (niters_skip) == INTEGER_CST)
12073 iv_limit += wi::to_widest (niters_skip);
12074 else
12075 iv_limit += max_vf - 1;
12077 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12078 /* Make a conservatively-correct assumption. */
12079 iv_limit += max_vf - 1;
12081 /* IV_LIMIT is the maximum number of latch iterations, which is also
12082 the maximum in-range IV value. Round this value down to the previous
12083 vector alignment boundary and then add an extra full iteration. */
12084 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12085 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
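/* Worked example with made-up numbers: for a constant VF of 4 (so
   max_vf = 4), a maximum latch count of 1001 and no skipped or peeled
   iterations, iv_limit = (1001 & -4) + 4 = 1004, i.e. the IV must be
   able to hold 1004.  */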
12087 return iv_limit;
12090 /* For the given rgroup_controls RGC, check whether an induction variable
12091 would ever hit a value that produces a set of all-false masks or zero
12092 lengths before wrapping around. Return true if it's possible to wrap
12093 around before hitting the desirable value, otherwise return false. */
12095 bool
12096 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12098 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12100 if (iv_limit == -1)
12101 return true;
12103 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12104 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12105 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12107 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12108 return true;
12110 return false;
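/* Continuing the example above with made-up numbers: with iv_limit = 1004
   and an rgroup where max_nscalars_per_iter * factor = 2, the compare type
   needs at least wi::min_precision (2008, UNSIGNED) = 11 bits, so a 16-bit
   compare type cannot wrap and the function returns false.  */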