gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target-specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different vector sizes will, for now, need
146 to specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
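/* For illustration: with 16-byte vectors, a loop operating only on 4-byte
   ints gets VF = 4, while a loop operating only on 2-byte shorts gets
   VF = 8.  */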
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if it is worthwhile to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
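/* For illustration: for a counted loop such as
       for (i = k; i < n; i += 4)
   the scalar evolution of I is the chrec {k, +, 4}, so *INIT is K and
   *STEP is 4.  */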
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Only nonlinear inductions of integer type are supported:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
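/* For illustration, each of the following loop-carried updates of an
   integral X is a nonlinear induction handled here:
       x = -x;         neg:                 the recorded step is -1
       x = x * 3;      mul by constant:     step is 3
       x = x << 1;     lshift by constant:  step is 1
       x = x >> 2;     rshift by constant:  step is 2  */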
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
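/* For example, the scalar cycle of T below is a first-order recurrence:
       t = t_init;
       for (i = 0; i < n; i++)
         {
           b[i] = a[i] - t;
           t = a[i];
         }
   each iteration uses the value of T defined in the previous one.  */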
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs a vector shuffle. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be
585 subsequent SLP analyses. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such an inner loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmts in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
825 it as a regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop executes and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
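/* For example, for a simple counted loop
       for (i = 0; i < n; i++) ...
   with N known to be non-zero, NUMBER_OF_ITERATIONS is N (the number of
   header executions) and NUMBER_OF_ITERATIONSM1 is N - 1 (the number of
   latch executions).  */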
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions; this can simplify
911 the computation of the niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based on the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
945 unshare_expr (niter),
946 build_int_cst (TREE_TYPE (niter), 1));
947 *number_of_iterations = niter;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
953 return conds;
956 /* Determine the main loop exit for the vectorizer. */
958 edge
959 vec_init_loop_exit_info (class loop *loop)
961 /* Before we begin we must first determine which exit is the main one and
962 which are auxiliary exits. */
963 auto_vec<edge> exits = get_loop_exit_edges (loop);
964 if (exits.length () == 1)
965 return exits[0];
967 /* If we have multiple exits we only support a counting IV at the moment.
968 Analyze all exits and return one. */
969 class tree_niter_desc niter_desc;
970 edge candidate = NULL;
971 for (edge exit : exits)
973 if (!get_loop_exit_condition (exit))
974 continue;
976 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
977 && !chrec_contains_undetermined (niter_desc.niter))
979 if (!niter_desc.may_be_zero || !candidate)
980 candidate = exit;
984 return candidate;
987 /* Function bb_in_loop_p
989 Used as predicate for dfs order traversal of the loop bbs. */
991 static bool
992 bb_in_loop_p (const_basic_block bb, const void *data)
994 const class loop *const loop = (const class loop *)data;
995 if (flow_bb_inside_loop_p (loop, bb))
996 return true;
997 return false;
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002 stmt_vec_info structs for all the stmts in LOOP_IN. */
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005 : vec_info (vec_info::loop, shared),
1006 loop (loop_in),
1007 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008 num_itersm1 (NULL_TREE),
1009 num_iters (NULL_TREE),
1010 num_iters_unchanged (NULL_TREE),
1011 num_iters_assumptions (NULL_TREE),
1012 vector_costs (nullptr),
1013 scalar_costs (nullptr),
1014 th (0),
1015 versioning_threshold (0),
1016 vectorization_factor (0),
1017 main_loop_edge (nullptr),
1018 skip_main_loop_edge (nullptr),
1019 skip_this_loop_edge (nullptr),
1020 reusable_accumulators (),
1021 suggested_unroll_factor (1),
1022 max_vectorization_factor (0),
1023 mask_skip_niters (NULL_TREE),
1024 rgroup_compare_type (NULL_TREE),
1025 simd_if_cond (NULL_TREE),
1026 partial_vector_style (vect_partial_vectors_none),
1027 unaligned_dr (NULL),
1028 peeling_for_alignment (0),
1029 ptr_mask (0),
1030 ivexpr_map (NULL),
1031 scan_map (NULL),
1032 slp_unrolling_factor (1),
1033 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034 vectorizable (false),
1035 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036 using_partial_vectors_p (false),
1037 using_decrementing_iv_p (false),
1038 using_select_vl_p (false),
1039 epil_using_partial_vectors_p (false),
1040 partial_load_store_bias (0),
1041 peeling_for_gaps (false),
1042 peeling_for_niter (false),
1043 no_data_dependencies (false),
1044 has_mask_store (false),
1045 scalar_loop_scaling (profile_probability::uninitialized ()),
1046 scalar_loop (NULL),
1047 orig_loop_info (NULL),
1048 vec_loop_iv_exit (NULL),
1049 vec_epilogue_loop_iv_exit (NULL),
1050 scalar_loop_iv_exit (NULL)
1052 /* CHECKME: We want to visit all BBs before their successors (except for
1053 latch blocks, for which this assertion wouldn't hold). In the simple
1054 case of the loop forms we allow, a dfs order of the BBs would be the same
1055 as reversed postorder traversal, so we are safe. */
1057 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1058 bbs, loop->num_nodes, loop);
1059 gcc_assert (nbbs == loop->num_nodes);
1061 for (unsigned int i = 0; i < nbbs; i++)
1063 basic_block bb = bbs[i];
1064 gimple_stmt_iterator si;
1066 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1068 gimple *phi = gsi_stmt (si);
1069 gimple_set_uid (phi, 0);
1070 add_stmt (phi);
1073 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1075 gimple *stmt = gsi_stmt (si);
1076 gimple_set_uid (stmt, 0);
1077 if (is_gimple_debug (stmt))
1078 continue;
1079 add_stmt (stmt);
1080 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1081 third argument is the #pragma omp simd if (x) condition: when it is 0,
1082 the loop shouldn't be vectorized; when it is a non-zero constant, it
1083 should be vectorized normally; otherwise the loop is versioned, with the
1084 vectorized copy taken when the condition is non-zero at runtime. */
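/* For illustration, for a loop annotated with
     #pragma omp simd if (c)
   the condition C is that third argument; we record it in
   simd_if_cond so the loop can be versioned on it.  */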
1085 if (loop_in->simduid
1086 && is_gimple_call (stmt)
1087 && gimple_call_internal_p (stmt)
1088 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1089 && gimple_call_num_args (stmt) >= 3
1090 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1091 && (loop_in->simduid
1092 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1094 tree arg = gimple_call_arg (stmt, 2);
1095 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1096 simd_if_cond = arg;
1097 else
1098 gcc_assert (integer_nonzerop (arg));
1103 epilogue_vinfos.create (6);
1106 /* Free all levels of rgroup CONTROLS. */
1108 void
1109 release_vec_loop_controls (vec<rgroup_controls> *controls)
1111 rgroup_controls *rgc;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (*controls, i, rgc)
1114 rgc->controls.release ();
1115 controls->release ();
1118 /* Free all memory used by the _loop_vec_info, as well as all the
1119 stmt_vec_info structs of all the stmts in the loop. */
1121 _loop_vec_info::~_loop_vec_info ()
1123 free (bbs);
1125 release_vec_loop_controls (&masks.rgc_vec);
1126 release_vec_loop_controls (&lens);
1127 delete ivexpr_map;
1128 delete scan_map;
1129 epilogue_vinfos.release ();
1130 delete scalar_costs;
1131 delete vector_costs;
1133 /* When we release an epilogue vinfo that we do not intend to use
1134 avoid clearing AUX of the main loop which should continue to
1135 point to the main loop vinfo since otherwise we'll leak that. */
1136 if (loop->aux == this)
1137 loop->aux = NULL;
1140 /* Return an invariant or register for EXPR and emit necessary
1141 computations in the LOOP_VINFO loop preheader. */
1143 tree
1144 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1146 if (is_gimple_reg (expr)
1147 || is_gimple_min_invariant (expr))
1148 return expr;
1150 if (! loop_vinfo->ivexpr_map)
1151 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1152 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1153 if (! cached)
1155 gimple_seq stmts = NULL;
1156 cached = force_gimple_operand (unshare_expr (expr),
1157 &stmts, true, NULL_TREE);
1158 if (stmts)
1160 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1161 gsi_insert_seq_on_edge_immediate (e, stmts);
1164 return cached;
1167 /* Return true if we can use CMP_TYPE as the comparison type to produce
1168 all masks required to mask LOOP_VINFO. */
1170 static bool
1171 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1173 rgroup_controls *rgm;
1174 unsigned int i;
1175 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1176 if (rgm->type != NULL_TREE
1177 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1178 cmp_type, rgm->type,
1179 OPTIMIZE_FOR_SPEED))
1180 return false;
1181 return true;
1184 /* Calculate the maximum number of scalars per iteration for every
1185 rgroup in LOOP_VINFO. */
1187 static unsigned int
1188 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1190 unsigned int res = 1;
1191 unsigned int i;
1192 rgroup_controls *rgm;
1193 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194 res = MAX (res, rgm->max_nscalars_per_iter);
1195 return res;
1198 /* Calculate the minimum precision necessary to represent:
1200 MAX_NITERS * FACTOR
1202 as an unsigned integer, where MAX_NITERS is the maximum number of
1203 loop header iterations for the original scalar form of LOOP_VINFO. */
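/* For illustration: if the scalar loop runs at most 1000 iterations and
   FACTOR is 4, the product 4000 fits in 12 bits, so 12 is returned.  */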
1205 static unsigned
1206 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1208 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1210 /* Get the maximum number of iterations that is representable
1211 in the counter type. */
1212 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1213 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1215 /* Get a more refined estimate for the number of iterations. */
1216 widest_int max_back_edges;
1217 if (max_loop_iterations (loop, &max_back_edges))
1218 max_ni = wi::smin (max_ni, max_back_edges + 1);
1220 /* Work out how many bits we need to represent the limit. */
1221 return wi::min_precision (max_ni * factor, UNSIGNED);
1224 /* True if the loop needs peeling or partial vectors when vectorized. */
1226 static bool
1227 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1229 unsigned HOST_WIDE_INT const_vf;
1230 HOST_WIDE_INT max_niter
1231 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1233 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1234 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1235 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1236 (loop_vinfo));
1238 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1239 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1241 /* Work out the (constant) number of iterations that need to be
1242 peeled for reasons other than niters. */
1243 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1244 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1245 peel_niter += 1;
1246 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1247 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1248 return true;
1250 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1251 /* ??? When peeling for gaps but not alignment, we could
1252 try to check whether the (variable) niters is known to be
1253 VF * N + 1. That's something of a niche case though. */
1254 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1255 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1256 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1257 < (unsigned) exact_log2 (const_vf))
1258 /* In case of versioning, check if the maximum number of
1259 iterations is greater than th. If they are identical,
1260 the epilogue is unnecessary. */
1261 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1262 || ((unsigned HOST_WIDE_INT) max_niter
1263 > (th / const_vf) * const_vf))))
1264 return true;
1266 return false;
1269 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1270 whether we can actually generate the masks required. Return true if so,
1271 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
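/* For illustration: with WHILE_ULT-style masking each mask is computed as
   WHILE_ULT (iv, limit), i.e. lane L is active iff IV + L < LIMIT, so the
   final, partially filled vector iteration needs no scalar epilogue.  */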
1273 static bool
1274 vect_verify_full_masking (loop_vec_info loop_vinfo)
1276 unsigned int min_ni_width;
1278 /* Use a normal loop if there are no statements that need masking.
1279 This only happens in rare degenerate cases: it means that the loop
1280 has no loads, no stores, and no live-out values. */
1281 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1282 return false;
1284 /* Produce the rgroup controls. */
1285 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1287 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1288 tree vectype = mask.first;
1289 unsigned nvectors = mask.second;
1291 if (masks->rgc_vec.length () < nvectors)
1292 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1293 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1294 /* The number of scalars per iteration and the number of vectors are
1295 both compile-time constants. */
1296 unsigned int nscalars_per_iter
1297 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1298 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1300 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1302 rgm->max_nscalars_per_iter = nscalars_per_iter;
1303 rgm->type = truth_type_for (vectype);
1304 rgm->factor = 1;
1308 unsigned int max_nscalars_per_iter
1309 = vect_get_max_nscalars_per_iter (loop_vinfo);
1311 /* Work out how many bits we need to represent the limit. */
1312 min_ni_width
1313 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1315 /* Find a scalar mode for which WHILE_ULT is supported. */
1316 opt_scalar_int_mode cmp_mode_iter;
1317 tree cmp_type = NULL_TREE;
1318 tree iv_type = NULL_TREE;
1319 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1320 unsigned int iv_precision = UINT_MAX;
1322 if (iv_limit != -1)
1323 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1324 UNSIGNED);
1326 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1328 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1329 if (cmp_bits >= min_ni_width
1330 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1332 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1333 if (this_type
1334 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1336 /* Although we could stop as soon as we find a valid mode,
1337 there are at least two reasons why that's not always the
1338 best choice:
1340 - An IV that's Pmode or wider is more likely to be reusable
1341 in address calculations than an IV that's narrower than
1342 Pmode.
1344 - Doing the comparison in IV_PRECISION or wider allows
1345 a natural 0-based IV, whereas using a narrower comparison
1346 type requires mitigations against wrap-around.
1348 Conversely, if the IV limit is variable, doing the comparison
1349 in a wider type than the original type can introduce
1350 unnecessary extensions, so picking the widest valid mode
1351 is not always a good choice either.
1353 Here we prefer the first IV type that's Pmode or wider,
1354 and the first comparison type that's IV_PRECISION or wider.
1355 (The comparison type must be no wider than the IV type,
1356 to avoid extensions in the vector loop.)
1358 ??? We might want to try continuing beyond Pmode for ILP32
1359 targets if CMP_BITS < IV_PRECISION. */
1360 iv_type = this_type;
1361 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1362 cmp_type = this_type;
1363 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1364 break;
1369 if (!cmp_type)
1371 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1372 return false;
1375 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1376 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1377 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1378 return true;
1381 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1382 whether we can actually generate AVX512 style masks. Return true if so,
1383 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
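/* For illustration: in the AVX512 style each mask is produced by an
   ordinary vector comparison of a step vector against the broadcast
   count of remaining scalar iterations, roughly
       mask = { 0, 1, ..., n-1 } < { remain, ..., remain };
   which is why we look for a supported LT_EXPR vector compare into the
   mask type below rather than for WHILE_ULT.  */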
1385 static bool
1386 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1388 /* Produce a differently organized rgc_vec and check differently whether
1389 we can produce the masks. */
1391 /* Use a normal loop if there are no statements that need masking.
1392 This only happens in rare degenerate cases: it means that the loop
1393 has no loads, no stores, and no live-out values. */
1394 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1395 return false;
1397 /* For the decrementing IV we need to represent all values in
1398 [0, niter + niter_skip] where niter_skip is the number of elements we
1399 skip in the first iteration for prologue peeling. */
1400 tree iv_type = NULL_TREE;
1401 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1402 unsigned int iv_precision = UINT_MAX;
1403 if (iv_limit != -1)
1404 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1406 /* First compute the type for the IV we use to track the remaining
1407 scalar iterations. */
1408 opt_scalar_int_mode cmp_mode_iter;
1409 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1411 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1412 if (cmp_bits >= iv_precision
1413 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1415 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1416 if (iv_type)
1417 break;
1420 if (!iv_type)
1421 return false;
1423 /* Produce the rgroup controls. */
1424 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1426 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1427 tree vectype = mask.first;
1428 unsigned nvectors = mask.second;
1430 /* The number of scalars per iteration and the number of vectors are
1431 both compile-time constants. */
1432 unsigned int nscalars_per_iter
1433 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1434 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1436 /* We index the rgroup_controls vector with nscalars_per_iter
1437 which we keep constant and instead have a varying nvectors,
1438 remembering the vector mask with the fewest nV. */
1439 if (masks->rgc_vec.length () < nscalars_per_iter)
1440 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1441 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1443 if (!rgm->type || rgm->factor > nvectors)
1445 rgm->type = truth_type_for (vectype);
1446 rgm->compare_type = NULL_TREE;
1447 rgm->max_nscalars_per_iter = nscalars_per_iter;
1448 rgm->factor = nvectors;
1449 rgm->bias_adjusted_ctrl = NULL_TREE;
1453 /* There is no fixed compare type we are going to use but we have to
1454 be able to get at one for each mask group. */
1455 unsigned int min_ni_width
1456 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1458 bool ok = true;
1459 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1461 tree mask_type = rgc.type;
1462 if (!mask_type)
1463 continue;
1465 /* For now vect_get_loop_mask only supports integer mode masks
1466 when we need to split it. */
1467 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1468 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1470 ok = false;
1471 break;
1474 /* If iv_type is usable as compare type use that - we can elide the
1475 saturation in that case. */
1476 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1478 tree cmp_vectype
1479 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1480 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1481 rgc.compare_type = cmp_vectype;
1483 if (!rgc.compare_type)
1484 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1486 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1487 if (cmp_bits >= min_ni_width
1488 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1490 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1491 if (!cmp_type)
1492 continue;
1494 /* Check whether we can produce the mask with cmp_type. */
1495 tree cmp_vectype
1496 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1497 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1499 rgc.compare_type = cmp_vectype;
1500 break;
1504 if (!rgc.compare_type)
1506 ok = false;
1507 break;
1510 if (!ok)
1512 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1513 return false;
1516 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1517 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1518 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1519 return true;
1522 /* Check whether we can use vector access with length based on a precision
1523 comparison. So far, to keep it simple, we only allow the case that the
1524 precision of the target supported length is larger than the precision
1525 required by loop niters. */
1527 static bool
1528 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1530 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1531 return false;
1533 machine_mode len_load_mode, len_store_mode;
1534 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1535 .exists (&len_load_mode))
1536 return false;
1537 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1538 .exists (&len_store_mode))
1539 return false;
1541 signed char partial_load_bias = internal_len_load_store_bias
1542 (IFN_LEN_LOAD, len_load_mode);
1544 signed char partial_store_bias = internal_len_load_store_bias
1545 (IFN_LEN_STORE, len_store_mode);
1547 gcc_assert (partial_load_bias == partial_store_bias);
1549 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1550 return false;
1552 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1553 len_loads with a length of zero. In order to avoid that we prohibit
1554 more than one loop length here. */
1555 if (partial_load_bias == -1
1556 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1557 return false;
1559 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1561 unsigned int max_nitems_per_iter = 1;
1562 unsigned int i;
1563 rgroup_controls *rgl;
1564 /* Find the maximum number of items per iteration for every rgroup. */
1565 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1567 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1568 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1571 /* Work out how many bits we need to represent the length limit. */
1572 unsigned int min_ni_prec
1573 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1575 /* Now use the maximum of the precisions below for one suitable IV type:
1576 - the IV's natural precision
1577 - the precision needed to hold: the maximum number of scalar
1578 iterations multiplied by the scale factor (min_ni_prec above)
1579 - the Pmode precision
1581 If min_ni_prec is less than the precision of the current niters,
1582 we prefer to still use the niters type. Prefer to use Pmode and
1583 wider IVs to avoid narrow conversions. */
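/* For illustration: on a typical 64-bit target (64-bit Pmode and
   BITS_PER_WORD), the MAX below yields min_ni_prec = 64 and the mode walk
   below then picks a 64-bit unsigned IV type.  */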
1585 unsigned int ni_prec
1586 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1587 min_ni_prec = MAX (min_ni_prec, ni_prec);
1588 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1590 tree iv_type = NULL_TREE;
1591 opt_scalar_int_mode tmode_iter;
1592 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1594 scalar_mode tmode = tmode_iter.require ();
1595 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1597 /* ??? Do we really want to construct one IV whose precision exceeds
1598 BITS_PER_WORD? */
1599 if (tbits > BITS_PER_WORD)
1600 break;
1602 /* Find the first available standard integral type. */
1603 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1605 iv_type = build_nonstandard_integer_type (tbits, true);
1606 break;
1610 if (!iv_type)
1612 if (dump_enabled_p ())
1613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1614 "can't vectorize with length-based partial vectors"
1615 " because there is no suitable iv type.\n");
1616 return false;
1619 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1620 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1621 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1623 return true;
1626 /* Calculate the cost of one scalar iteration of the loop. */
1627 static void
1628 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1630 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1632 int nbbs = loop->num_nodes, factor;
1633 int innerloop_iters, i;
1635 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1637 /* Gather costs for statements in the scalar loop. */
1639 /* FORNOW. */
1640 innerloop_iters = 1;
1641 if (loop->inner)
1642 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1644 for (i = 0; i < nbbs; i++)
1646 gimple_stmt_iterator si;
1647 basic_block bb = bbs[i];
1649 if (bb->loop_father == loop->inner)
1650 factor = innerloop_iters;
1651 else
1652 factor = 1;
1654 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1656 gimple *stmt = gsi_stmt (si);
1657 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1659 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1660 continue;
1662 /* Skip stmts that are not vectorized inside the loop. */
1663 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1664 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1665 && (!STMT_VINFO_LIVE_P (vstmt_info)
1666 || !VECTORIZABLE_CYCLE_DEF
1667 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1668 continue;
1670 vect_cost_for_stmt kind;
1671 if (STMT_VINFO_DATA_REF (stmt_info))
1673 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1674 kind = scalar_load;
1675 else
1676 kind = scalar_store;
1678 else if (vect_nop_conversion_p (stmt_info))
1679 continue;
1680 else
1681 kind = scalar_stmt;
1683 /* We are using vect_prologue here to avoid scaling twice
1684 by the inner loop factor. */
1685 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1686 factor, kind, stmt_info, 0, vect_prologue);
1690 /* Now accumulate cost. */
1691 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1692 add_stmt_costs (loop_vinfo->scalar_costs,
1693 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1694 loop_vinfo->scalar_costs->finish_cost (nullptr);
1698 /* Function vect_analyze_loop_form.
1700 Verify that certain CFG restrictions hold, including:
1701 - the loop has a pre-header
1702 - the loop has a single entry and exit
1703 - the loop exit condition is simple enough
1704 - the number of iterations can be analyzed, i.e., a countable loop. The
1705 niter could be analyzed under some assumptions. */
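/* For illustration: a counted loop such as
       for (i = 0; i < n; i++) ...
   satisfies these restrictions, whereas a pointer-chasing loop like
       while (p) p = p->next;
   is rejected because its number of iterations cannot be computed.  */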
1707 opt_result
1708 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1710 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1712 edge exit_e = vec_init_loop_exit_info (loop);
1713 if (!exit_e)
1714 return opt_result::failure_at (vect_location,
1715 "not vectorized:"
1716 " could not determine main exit from"
1717 " loop with multiple exits.\n");
1718 info->loop_exit = exit_e;
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_NOTE, vect_location,
1721 "using as main loop exit: %d -> %d [AUX: %p]\n",
1722 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1724 /* Different restrictions apply when we are considering an inner-most loop,
1725 vs. an outer (nested) loop.
1726 (FORNOW. May want to relax some of these restrictions in the future). */
1728 info->inner_loop_cond = NULL;
1729 if (!loop->inner)
1731 /* Inner-most loop. We currently require that the number of BBs is
1732 exactly 2 (the header and latch). Vectorizable inner-most loops
1733 look like this:
1735 (pre-header)
1737 header <--------+
1738 | | |
1739 | +--> latch --+
1741 (exit-bb) */
1743 if (loop->num_nodes != 2)
1744 return opt_result::failure_at (vect_location,
1745 "not vectorized:"
1746 " control flow in loop.\n");
1748 if (empty_block_p (loop->header))
1749 return opt_result::failure_at (vect_location,
1750 "not vectorized: empty loop.\n");
1752 else
1754 class loop *innerloop = loop->inner;
1755 edge entryedge;
1757 /* Nested loop. We currently require that the loop is doubly-nested,
1758 contains a single inner loop, and the number of BBs is exactly 5.
1759 Vectorizable outer-loops look like this:
1761 (pre-header)
1763 header <---+
1765 inner-loop |
1767 tail ------+
1769 (exit-bb)
1771 The inner-loop has the properties expected of inner-most loops
1772 as described above. */
1774 if ((loop->inner)->inner || (loop->inner)->next)
1775 return opt_result::failure_at (vect_location,
1776 "not vectorized:"
1777 " multiple nested loops.\n");
1779 if (loop->num_nodes != 5)
1780 return opt_result::failure_at (vect_location,
1781 "not vectorized:"
1782 " control flow in loop.\n");
1784 entryedge = loop_preheader_edge (innerloop);
1785 if (entryedge->src != loop->header
1786 || !single_exit (innerloop)
1787 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1788 return opt_result::failure_at (vect_location,
1789 "not vectorized:"
1790 " unsupported outerloop form.\n");
1792 /* Analyze the inner-loop. */
1793 vect_loop_form_info inner;
1794 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1795 if (!res)
1797 if (dump_enabled_p ())
1798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1799 "not vectorized: Bad inner loop.\n");
1800 return res;
1803 /* We don't support analyzing the niter under assumptions for the inner
1804 loop. */
1805 if (!integer_onep (inner.assumptions))
1806 return opt_result::failure_at (vect_location,
1807 "not vectorized: Bad inner loop.\n");
1809 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1810 return opt_result::failure_at (vect_location,
1811 "not vectorized: inner-loop count not"
1812 " invariant.\n");
1814 if (dump_enabled_p ())
1815 dump_printf_loc (MSG_NOTE, vect_location,
1816 "Considering outer-loop vectorization.\n");
1817 info->inner_loop_cond = inner.conds[0];
1820 if (!single_exit (loop))
1821 return opt_result::failure_at (vect_location,
1822 "not vectorized: multiple exits.\n");
1823 if (EDGE_COUNT (loop->header->preds) != 2)
1824 return opt_result::failure_at (vect_location,
1825 "not vectorized:"
1826 " too many incoming edges.\n");
1828 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1829 that the loop is represented as a do-while (with a proper if-guard
1830 before the loop if needed), where the loop header contains all the
1831 executable statements, and the latch is empty. */
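/* For illustration (a source-level sketch only, not a statement about
   the exact GIMPLE that is produced), a loop such as

     i = 0;
     do
       {
         a[i] = b[i] + c[i];
         i++;
       }
     while (i < n);

   has the expected shape: the header holds the body together with the
   exit comparison, and the latch block contains no statements.  */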
1832 if (!empty_block_p (loop->latch)
1833 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1834 return opt_result::failure_at (vect_location,
1835 "not vectorized: latch block not empty.\n");
1837 /* Make sure the exit is not abnormal. */
1838 if (exit_e->flags & EDGE_ABNORMAL)
1839 return opt_result::failure_at (vect_location,
1840 "not vectorized:"
1841 " abnormal loop exit edge.\n");
1843 info->conds
1844 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1845 &info->number_of_iterations,
1846 &info->number_of_iterationsm1);
1848 if (info->conds.is_empty ())
1849 return opt_result::failure_at
1850 (vect_location,
1851 "not vectorized: complicated exit condition.\n");
1853 /* Determine what the primary and alternate exit conds are. */
1854 for (unsigned i = 0; i < info->conds.length (); i++)
1856 gcond *cond = info->conds[i];
1857 if (exit_e->src == gimple_bb (cond))
1858 std::swap (info->conds[0], info->conds[i]);
1861 if (integer_zerop (info->assumptions)
1862 || !info->number_of_iterations
1863 || chrec_contains_undetermined (info->number_of_iterations))
1864 return opt_result::failure_at
1865 (info->conds[0],
1866 "not vectorized: number of iterations cannot be computed.\n");
1868 if (integer_zerop (info->number_of_iterations))
1869 return opt_result::failure_at
1870 (info->conds[0],
1871 "not vectorized: number of iterations = 0.\n");
1873 if (!(tree_fits_shwi_p (info->number_of_iterations)
1874 && tree_to_shwi (info->number_of_iterations) > 0))
1876 if (dump_enabled_p ())
1878 dump_printf_loc (MSG_NOTE, vect_location,
1879 "Symbolic number of iterations is ");
1880 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1881 dump_printf (MSG_NOTE, "\n");
1885 return opt_result::success ();
1888 /* Create a loop_vec_info for LOOP with SHARED and the
1889 vect_analyze_loop_form result. */
1891 loop_vec_info
1892 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1893 const vect_loop_form_info *info,
1894 loop_vec_info main_loop_info)
1896 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1897 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1898 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1899 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1900 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1901 /* Also record the assumptions for versioning. */
1902 if (!integer_onep (info->assumptions) && !main_loop_info)
1903 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1905 for (gcond *cond : info->conds)
1907 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1908 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1911 for (unsigned i = 1; i < info->conds.length (); i ++)
1912 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1913 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1915 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1917 if (info->inner_loop_cond)
1919 stmt_vec_info inner_loop_cond_info
1920 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1921 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1922 /* If we have an estimate on the number of iterations of the inner
1923 loop, use that to limit the scale for costing; otherwise use
1924 --param vect-inner-loop-cost-factor literally. */
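/* A hedged example: if the inner loop is estimated to execute 10 times
   and --param vect-inner-loop-cost-factor is, say, 50, the code below
   uses MIN (10, 50) = 10 as the scaling factor; without an estimate
   the parameter value is used as-is.  */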
1925 widest_int nit;
1926 if (estimated_stmt_executions (loop->inner, &nit))
1927 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1928 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1931 return loop_vinfo;
1936 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1937 statements, update the vectorization factor. */
1939 static void
1940 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1942 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1943 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1944 int nbbs = loop->num_nodes;
1945 poly_uint64 vectorization_factor;
1946 int i;
1948 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1950 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1951 gcc_assert (known_ne (vectorization_factor, 0U));
1953 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1954 vectorization factor of the loop is the unrolling factor required by
1955 the SLP instances. If that unrolling factor is 1, we say that we
1956 perform pure SLP on the loop - cross-iteration parallelism is not
1957 exploited. */
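/* A hedged illustration: if every statement in the loop belongs to SLP
   instances, e.g. a group of four stores x[4*i+0] ... x[4*i+3] whose
   vector type also holds four elements, the required SLP unrolling
   factor is 1 and the loop is pure SLP: each vector statement covers a
   single scalar iteration and no cross-iteration parallelism is used.  */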
1958 bool only_slp_in_loop = true;
1959 for (i = 0; i < nbbs; i++)
1961 basic_block bb = bbs[i];
1962 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1963 gsi_next (&si))
1965 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1966 if (!stmt_info)
1967 continue;
1968 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1969 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1970 && !PURE_SLP_STMT (stmt_info))
1971 /* STMT needs both SLP and loop-based vectorization. */
1972 only_slp_in_loop = false;
1974 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1975 gsi_next (&si))
1977 if (is_gimple_debug (gsi_stmt (si)))
1978 continue;
1979 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1980 stmt_info = vect_stmt_to_vectorize (stmt_info);
1981 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1982 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1983 && !PURE_SLP_STMT (stmt_info))
1984 /* STMT needs both SLP and loop-based vectorization. */
1985 only_slp_in_loop = false;
1989 if (only_slp_in_loop)
1991 if (dump_enabled_p ())
1992 dump_printf_loc (MSG_NOTE, vect_location,
1993 "Loop contains only SLP stmts\n");
1994 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1996 else
1998 if (dump_enabled_p ())
1999 dump_printf_loc (MSG_NOTE, vect_location,
2000 "Loop contains SLP and non-SLP stmts\n");
2001 /* Both the vectorization factor and unroll factor have the form
2002 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2003 so they must have a common multiple. */
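/* A hedged illustration: if the loop-based vectorization factor is 4
   and the SLP unrolling factor is 2, force_common_multiple yields 4;
   with factors 4 and 8 it yields 8.  */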
2004 vectorization_factor
2005 = force_common_multiple (vectorization_factor,
2006 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2009 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2010 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_NOTE, vect_location,
2013 "Updating vectorization factor to ");
2014 dump_dec (MSG_NOTE, vectorization_factor);
2015 dump_printf (MSG_NOTE, ".\n");
2019 /* Return true if STMT_INFO describes a double reduction phi and if
2020 the other phi in the reduction is also relevant for vectorization.
2021 This rejects cases such as:
2023 outer1:
2024 x_1 = PHI <x_3(outer2), ...>;
2027 inner:
2028 x_2 = ...;
2031 outer2:
2032 x_3 = PHI <x_2(inner)>;
2034 if nothing in x_2 or elsewhere makes x_1 relevant. */
2036 static bool
2037 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2039 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2040 return false;
2042 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2045 /* Function vect_analyze_loop_operations.
2047 Scan the loop stmts and make sure they are all vectorizable. */
2049 static opt_result
2050 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2052 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2053 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2054 int nbbs = loop->num_nodes;
2055 int i;
2056 stmt_vec_info stmt_info;
2057 bool need_to_vectorize = false;
2058 bool ok;
2060 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2062 auto_vec<stmt_info_for_cost> cost_vec;
2064 for (i = 0; i < nbbs; i++)
2066 basic_block bb = bbs[i];
2068 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2069 gsi_next (&si))
2071 gphi *phi = si.phi ();
2072 ok = true;
2074 stmt_info = loop_vinfo->lookup_stmt (phi);
2075 if (dump_enabled_p ())
2076 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2077 (gimple *) phi);
2078 if (virtual_operand_p (gimple_phi_result (phi)))
2079 continue;
2081 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2082 (i.e., a phi in the tail of the outer-loop). */
2083 if (! is_loop_header_bb_p (bb))
2085 /* FORNOW: we currently don't support the case in which these phis
2086 are not used in the outer loop (unless it is a double reduction,
2087 i.e., this phi is vect_reduction_def), because this case
2088 would require us to actually do something here. */
2089 if (STMT_VINFO_LIVE_P (stmt_info)
2090 && !vect_active_double_reduction_p (stmt_info))
2091 return opt_result::failure_at (phi,
2092 "Unsupported loop-closed phi"
2093 " in outer-loop.\n");
2095 /* If PHI is used in the outer loop, we check that its operand
2096 is defined in the inner loop. */
2097 if (STMT_VINFO_RELEVANT_P (stmt_info))
2099 tree phi_op;
2101 if (gimple_phi_num_args (phi) != 1)
2102 return opt_result::failure_at (phi, "unsupported phi");
2104 phi_op = PHI_ARG_DEF (phi, 0);
2105 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2106 if (!op_def_info)
2107 return opt_result::failure_at (phi, "unsupported phi\n");
2109 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2110 && (STMT_VINFO_RELEVANT (op_def_info)
2111 != vect_used_in_outer_by_reduction))
2112 return opt_result::failure_at (phi, "unsupported phi\n");
2114 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2115 || (STMT_VINFO_DEF_TYPE (stmt_info)
2116 == vect_double_reduction_def))
2117 && !vectorizable_lc_phi (loop_vinfo,
2118 stmt_info, NULL, NULL))
2119 return opt_result::failure_at (phi, "unsupported phi\n");
2122 continue;
2125 gcc_assert (stmt_info);
2127 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2128 || STMT_VINFO_LIVE_P (stmt_info))
2129 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2130 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2131 /* A scalar-dependence cycle that we don't support. */
2132 return opt_result::failure_at (phi,
2133 "not vectorized:"
2134 " scalar dependence cycle.\n");
2136 if (STMT_VINFO_RELEVANT_P (stmt_info))
2138 need_to_vectorize = true;
2139 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2140 && ! PURE_SLP_STMT (stmt_info))
2141 ok = vectorizable_induction (loop_vinfo,
2142 stmt_info, NULL, NULL,
2143 &cost_vec);
2144 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2145 || (STMT_VINFO_DEF_TYPE (stmt_info)
2146 == vect_double_reduction_def)
2147 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2148 && ! PURE_SLP_STMT (stmt_info))
2149 ok = vectorizable_reduction (loop_vinfo,
2150 stmt_info, NULL, NULL, &cost_vec);
2151 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2152 == vect_first_order_recurrence)
2153 && ! PURE_SLP_STMT (stmt_info))
2154 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2155 &cost_vec);
2158 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2159 if (ok
2160 && STMT_VINFO_LIVE_P (stmt_info)
2161 && !PURE_SLP_STMT (stmt_info))
2162 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2163 -1, false, &cost_vec);
2165 if (!ok)
2166 return opt_result::failure_at (phi,
2167 "not vectorized: relevant phi not "
2168 "supported: %G",
2169 static_cast <gimple *> (phi));
2172 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2173 gsi_next (&si))
2175 gimple *stmt = gsi_stmt (si);
2176 if (!gimple_clobber_p (stmt)
2177 && !is_gimple_debug (stmt))
2179 opt_result res
2180 = vect_analyze_stmt (loop_vinfo,
2181 loop_vinfo->lookup_stmt (stmt),
2182 &need_to_vectorize,
2183 NULL, NULL, &cost_vec);
2184 if (!res)
2185 return res;
2188 } /* bbs */
2190 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2192 /* All operations in the loop are either irrelevant (they deal with loop
2193 control, or are dead), or are only used outside the loop and can be moved
2194 out of the loop (e.g. invariants, inductions). The loop can be
2195 optimized away by scalar optimizations. We're better off not
2196 touching this loop. */
2197 if (!need_to_vectorize)
2199 if (dump_enabled_p ())
2200 dump_printf_loc (MSG_NOTE, vect_location,
2201 "All the computation can be taken out of the loop.\n");
2202 return opt_result::failure_at
2203 (vect_location,
2204 "not vectorized: redundant loop. no profit to vectorize.\n");
2207 return opt_result::success ();
2210 /* Return true if we know that the iteration count is smaller than the
2211 vectorization factor. Return false if it isn't, or if we can't be sure
2212 either way. */
2214 static bool
2215 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2217 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2219 HOST_WIDE_INT max_niter;
2220 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2221 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2222 else
2223 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2225 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2226 return true;
2228 return false;
2231 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2232 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2233 definitely no, or -1 if it's worth retrying. */
2235 static int
2236 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2237 unsigned *suggested_unroll_factor)
2239 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2240 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2242 /* Only loops that can handle partially-populated vectors can have iteration
2243 counts less than the vectorization factor. */
2244 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2245 && vect_known_niters_smaller_than_vf (loop_vinfo))
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "not vectorized: iteration count smaller than "
2250 "vectorization factor.\n");
2251 return 0;
2254 /* If we know the number of iterations we can do better: for the
2255 epilogue we can also decide whether the main loop leaves us
2256 with enough iterations, preferring a smaller vector epilogue that is
2257 then also possibly used for the case in which we skip the vector loop. */
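/* A hedged example: with 103 scalar iterations, a main-loop VF of 16
   and no peeling for alignment or gaps, the computation below leaves
   103 % 16 = 7 scalar iterations for the epilogue to cover.  */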
2258 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2260 widest_int scalar_niters
2261 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2262 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2264 loop_vec_info orig_loop_vinfo
2265 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2266 unsigned lowest_vf
2267 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2268 int prolog_peeling = 0;
2269 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2270 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2271 if (prolog_peeling >= 0
2272 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2273 lowest_vf))
2275 unsigned gap
2276 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2277 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2278 % lowest_vf + gap);
2281 /* Reject vectorizing for a single scalar iteration, even if
2282 we could in principle implement that using partial vectors. */
2283 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2284 if (scalar_niters <= peeling_gap + 1)
2286 if (dump_enabled_p ())
2287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2288 "not vectorized: loop only has a single "
2289 "scalar iteration.\n");
2290 return 0;
2293 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2295 /* Check that the loop processes at least one full vector. */
2296 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2297 if (known_lt (scalar_niters, vf))
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301 "loop does not have enough iterations "
2302 "to support vectorization.\n");
2303 return 0;
2306 /* If we need to peel an extra epilogue iteration to handle data
2307 accesses with gaps, check that there are enough scalar iterations
2308 available.
2310 The check above is redundant with this one when peeling for gaps,
2311 but the distinction is useful for diagnostics. */
2312 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2313 && known_le (scalar_niters, vf))
2315 if (dump_enabled_p ())
2316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317 "loop does not have enough iterations "
2318 "to support peeling for gaps.\n");
2319 return 0;
2324 /* If using the "very cheap" model, reject cases in which we'd keep
2325 a copy of the scalar code (even if we might be able to vectorize it). */
2326 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2327 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2328 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2329 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "some scalar iterations would need to be peeled\n");
2334 return 0;
2337 int min_profitable_iters, min_profitable_estimate;
2338 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2339 &min_profitable_estimate,
2340 suggested_unroll_factor);
2342 if (min_profitable_iters < 0)
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "not vectorized: vectorization not profitable.\n");
2347 if (dump_enabled_p ())
2348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2349 "not vectorized: vector version will never be "
2350 "profitable.\n");
2351 return -1;
2354 int min_scalar_loop_bound = (param_min_vect_loop_bound
2355 * assumed_vf);
2357 /* Use the cost model only if it is more conservative than the
2358 user-specified threshold. */
2359 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2360 min_profitable_iters);
2362 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2364 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2365 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369 "not vectorized: vectorization not profitable.\n");
2370 if (dump_enabled_p ())
2371 dump_printf_loc (MSG_NOTE, vect_location,
2372 "not vectorized: iteration count smaller than user "
2373 "specified loop bound parameter or minimum profitable "
2374 "iterations (whichever is more conservative).\n");
2375 return 0;
2378 /* The static profitability threshold min_profitable_estimate includes
2379 the cost of having to check at runtime whether the scalar loop
2380 should be used instead. If it turns out that we don't need or want
2381 such a check, the threshold we should use for the static estimate
2382 is simply the point at which the vector loop becomes more profitable
2383 than the scalar loop. */
2384 if (min_profitable_estimate > min_profitable_iters
2385 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2386 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2387 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2388 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2390 if (dump_enabled_p ())
2391 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2392 " choice between the scalar and vector loops\n");
2393 min_profitable_estimate = min_profitable_iters;
2396 /* If the vector loop needs multiple iterations to be beneficial then
2397 things are probably too close to call, and the conservative thing
2398 would be to stick with the scalar code. */
2399 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2400 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2402 if (dump_enabled_p ())
2403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2404 "one iteration of the vector loop would be"
2405 " more expensive than the equivalent number of"
2406 " iterations of the scalar loop\n");
2407 return 0;
2410 HOST_WIDE_INT estimated_niter;
2412 /* If we are vectorizing an epilogue then we know the maximum number of
2413 scalar iterations it will cover is at least one lower than the
2414 vectorization factor of the main loop. */
2415 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2416 estimated_niter
2417 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2418 else
2420 estimated_niter = estimated_stmt_executions_int (loop);
2421 if (estimated_niter == -1)
2422 estimated_niter = likely_max_stmt_executions_int (loop);
2424 if (estimated_niter != -1
2425 && ((unsigned HOST_WIDE_INT) estimated_niter
2426 < MAX (th, (unsigned) min_profitable_estimate)))
2428 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2430 "not vectorized: estimated iteration count too "
2431 "small.\n");
2432 if (dump_enabled_p ())
2433 dump_printf_loc (MSG_NOTE, vect_location,
2434 "not vectorized: estimated iteration count smaller "
2435 "than specified loop bound parameter or minimum "
2436 "profitable iterations (whichever is more "
2437 "conservative).\n");
2438 return -1;
2441 return 1;
2444 static opt_result
2445 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2446 vec<data_reference_p> *datarefs,
2447 unsigned int *n_stmts)
2449 *n_stmts = 0;
2450 for (unsigned i = 0; i < loop->num_nodes; i++)
2451 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2452 !gsi_end_p (gsi); gsi_next (&gsi))
2454 gimple *stmt = gsi_stmt (gsi);
2455 if (is_gimple_debug (stmt))
2456 continue;
2457 ++(*n_stmts);
2458 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2459 NULL, 0);
2460 if (!res)
2462 if (is_gimple_call (stmt) && loop->safelen)
2464 tree fndecl = gimple_call_fndecl (stmt), op;
2465 if (fndecl == NULL_TREE
2466 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2468 fndecl = gimple_call_arg (stmt, 0);
2469 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2470 fndecl = TREE_OPERAND (fndecl, 0);
2471 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2473 if (fndecl != NULL_TREE)
2475 cgraph_node *node = cgraph_node::get (fndecl);
2476 if (node != NULL && node->simd_clones != NULL)
2478 unsigned int j, n = gimple_call_num_args (stmt);
2479 for (j = 0; j < n; j++)
2481 op = gimple_call_arg (stmt, j);
2482 if (DECL_P (op)
2483 || (REFERENCE_CLASS_P (op)
2484 && get_base_address (op)))
2485 break;
2487 op = gimple_call_lhs (stmt);
2488 /* Ignore #pragma omp declare simd functions
2489 if they don't have data references in the
2490 call stmt itself. */
2491 if (j == n
2492 && !(op
2493 && (DECL_P (op)
2494 || (REFERENCE_CLASS_P (op)
2495 && get_base_address (op)))))
2496 continue;
2500 return res;
2502 /* If dependence analysis will give up due to the limit on the
2503 number of datarefs, stop here and fail fatally. */
2504 if (datarefs->length ()
2505 > (unsigned)param_loop_max_datarefs_for_datadeps)
2506 return opt_result::failure_at (stmt, "exceeded param "
2507 "loop-max-datarefs-for-datadeps\n");
2509 return opt_result::success ();
2512 /* Look for SLP-only access groups and turn each individual access into its own
2513 group. */
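/* For instance (illustrative only): a group of four interleaved loads
   that is usable only via SLP is dissolved when SLP is not used; each
   access then becomes its own group of size 1, with a gap of
   group_size - 1 (here 3) recorded for loads, while stores are forced
   to strided handling since stores with gaps cannot be handled.  */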
2514 static void
2515 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2517 unsigned int i;
2518 struct data_reference *dr;
2520 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2522 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2523 FOR_EACH_VEC_ELT (datarefs, i, dr)
2525 gcc_assert (DR_REF (dr));
2526 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2528 /* Check if the load is a part of an interleaving chain. */
2529 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2531 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2532 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2533 unsigned int group_size = DR_GROUP_SIZE (first_element);
2535 /* Check whether this is an SLP-only group. */
2536 if (!STMT_SLP_TYPE (stmt_info)
2537 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2539 /* Dissolve the group. */
2540 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2542 stmt_vec_info vinfo = first_element;
2543 while (vinfo)
2545 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2546 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2547 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2548 DR_GROUP_SIZE (vinfo) = 1;
2549 if (STMT_VINFO_STRIDED_P (first_element)
2550 /* We cannot handle stores with gaps. */
2551 || DR_IS_WRITE (dr_info->dr))
2553 STMT_VINFO_STRIDED_P (vinfo) = true;
2554 DR_GROUP_GAP (vinfo) = 0;
2556 else
2557 DR_GROUP_GAP (vinfo) = group_size - 1;
2558 /* Duplicate and adjust the alignment info; it needs to
2559 be present on each group leader, see dr_misalignment. */
2560 if (vinfo != first_element)
2562 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2563 dr_info2->target_alignment = dr_info->target_alignment;
2564 int misalignment = dr_info->misalignment;
2565 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2567 HOST_WIDE_INT diff
2568 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2569 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2570 unsigned HOST_WIDE_INT align_c
2571 = dr_info->target_alignment.to_constant ();
2572 misalignment = (misalignment + diff) % align_c;
2574 dr_info2->misalignment = misalignment;
2576 vinfo = next;
2583 /* Determine if operating on full vectors for LOOP_VINFO might leave
2584 some scalar iterations still to do. If so, decide how we should
2585 handle those scalar iterations. The possibilities are:
2587 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2588 In this case:
2590 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2591 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2592 LOOP_VINFO_PEELING_FOR_NITER == false
2594 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2595 to handle the remaining scalar iterations. In this case:
2597 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2598 LOOP_VINFO_PEELING_FOR_NITER == true
2600 There are two choices:
2602 (2a) Consider vectorizing the epilogue loop at the same VF as the
2603 main loop, but using partial vectors instead of full vectors.
2604 In this case:
2606 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2608 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2609 In this case:
2611 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
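/* As an illustrative sketch (not generated code): for a loop with 100
   scalar iterations and VF = 16, option (1) runs 7 vector iterations
   with the last one only partially populated, whereas option (2) runs
   6 full vector iterations and leaves 100 - 96 = 4 scalar iterations
   to an epilogue loop, which (2a) may itself be a single
   partially-populated vector iteration at the same VF or (2b) may be
   vectorized at a lower VF.  */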
2614 opt_result
2615 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2617 /* Determine whether there would be any scalar iterations left over. */
2618 bool need_peeling_or_partial_vectors_p
2619 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2621 /* Decide whether to vectorize the loop with partial vectors. */
2622 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2623 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2624 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2625 && need_peeling_or_partial_vectors_p)
2627 /* For partial-vector-usage=1, try to push the handling of partial
2628 vectors to the epilogue, with the main loop continuing to operate
2629 on full vectors.
2631 If we are unrolling we also do not want to use partial vectors. This
2632 is to avoid the overhead of generating multiple masks and also to
2633 avoid having to execute entire iterations of FALSE masked instructions
2634 when dealing with one or fewer full iterations.
2636 ??? We could then end up failing to use partial vectors if we
2637 decide to peel iterations into a prologue, and if the main loop
2638 then ends up processing fewer than VF iterations. */
2639 if ((param_vect_partial_vector_usage == 1
2640 || loop_vinfo->suggested_unroll_factor > 1)
2641 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2642 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2643 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2644 else
2645 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2648 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "operating on %s vectors%s.\n",
2651 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2652 ? "partial" : "full",
2653 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2654 ? " for epilogue loop" : "");
2656 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2657 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2658 && need_peeling_or_partial_vectors_p);
2660 /* We set LOOP_VINFO_USING_SELECT_VL_P to true before the loop vectorization
2661 analysis, when we don't yet know whether the loop is vectorized with partial
2662 vectors (see tree-vect-loop-manip.cc for more details).
2664 However, the SELECT_VL vectorization style should only be applied to partial
2665 vectorization, since SELECT_VL is the GIMPLE IR that calculates the
2666 number of elements to be processed in each iteration.
2668 After the loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2669 if the loop is not vectorized with partial vectors. */
2670 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2671 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2673 return opt_result::success ();
2676 /* Function vect_analyze_loop_2.
2678 Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2679 analyses record information in some members of LOOP_VINFO. FATAL
2680 indicates whether some analysis hits a fatal error. If a non-NULL pointer
2681 SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with a
2682 suggested unroll factor worked out here, whereas a NULL pointer means the
2683 previously suggested unroll factor is being applied. SLP_DONE_FOR_SUGGESTED_UF
2684 holds the SLP decision made when the suggested unroll factor was worked
2685 out. */
2686 static opt_result
2687 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2688 unsigned *suggested_unroll_factor,
2689 bool& slp_done_for_suggested_uf)
2691 opt_result ok = opt_result::success ();
2692 int res;
2693 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2694 poly_uint64 min_vf = 2;
2695 loop_vec_info orig_loop_vinfo = NULL;
2697 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2698 loop_vec_info of the first vectorized loop. */
2699 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2700 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2701 else
2702 orig_loop_vinfo = loop_vinfo;
2703 gcc_assert (orig_loop_vinfo);
2705 /* The first group of checks is independent of the vector size. */
2706 fatal = true;
2708 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2709 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2710 return opt_result::failure_at (vect_location,
2711 "not vectorized: simd if(0)\n");
2713 /* Find all data references in the loop (which correspond to vdefs/vuses)
2714 and analyze their evolution in the loop. */
2716 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2718 /* Gather the data references and count stmts in the loop. */
2719 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2721 opt_result res
2722 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2723 &LOOP_VINFO_DATAREFS (loop_vinfo),
2724 &LOOP_VINFO_N_STMTS (loop_vinfo));
2725 if (!res)
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2729 "not vectorized: loop contains function "
2730 "calls or data references that cannot "
2731 "be analyzed\n");
2732 return res;
2734 loop_vinfo->shared->save_datarefs ();
2736 else
2737 loop_vinfo->shared->check_datarefs ();
2739 /* Analyze the data references and also adjust the minimal
2740 vectorization factor according to the loads and stores. */
2742 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2743 if (!ok)
2745 if (dump_enabled_p ())
2746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2747 "bad data references.\n");
2748 return ok;
2752 /* Check if we are applying the unroll factor now. */
2752 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2753 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2755 /* If the SLP decision was false when the suggested unroll factor was worked
2756 out, and we are now applying that suggested unroll factor, we can simply skip
2757 all SLP-related analyses this time. */
2758 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2760 /* Classify all cross-iteration scalar data-flow cycles.
2761 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2762 vect_analyze_scalar_cycles (loop_vinfo, slp);
2764 vect_pattern_recog (loop_vinfo);
2766 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2768 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2769 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2771 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2772 if (!ok)
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776 "bad data access.\n");
2777 return ok;
2780 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2782 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2783 if (!ok)
2785 if (dump_enabled_p ())
2786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2787 "unexpected pattern.\n");
2788 return ok;
2791 /* While the rest of the analysis below depends on it in some way. */
2792 fatal = false;
2794 /* Analyze data dependences between the data-refs in the loop
2795 and adjust the maximum vectorization factor according to
2796 the dependences.
2797 FORNOW: fail at the first data dependence that we encounter. */
2799 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2800 if (!ok)
2802 if (dump_enabled_p ())
2803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2804 "bad data dependence.\n");
2805 return ok;
2807 if (max_vf != MAX_VECTORIZATION_FACTOR
2808 && maybe_lt (max_vf, min_vf))
2809 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2810 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2812 ok = vect_determine_vectorization_factor (loop_vinfo);
2813 if (!ok)
2815 if (dump_enabled_p ())
2816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2817 "can't determine vectorization factor.\n");
2818 return ok;
2821 /* Compute the scalar iteration cost. */
2822 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2824 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2826 if (slp)
2828 /* Check the SLP opportunities in the loop, analyze and build
2829 SLP trees. */
2830 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2831 if (!ok)
2832 return ok;
2834 /* If there are any SLP instances mark them as pure_slp. */
2835 slp = vect_make_slp_decision (loop_vinfo);
2836 if (slp)
2838 /* Find stmts that need to be both vectorized and SLPed. */
2839 vect_detect_hybrid_slp (loop_vinfo);
2841 /* Update the vectorization factor based on the SLP decision. */
2842 vect_update_vf_for_slp (loop_vinfo);
2844 /* Optimize the SLP graph with the vectorization factor fixed. */
2845 vect_optimize_slp (loop_vinfo);
2847 /* Gather the loads reachable from the SLP graph entries. */
2848 vect_gather_slp_loads (loop_vinfo);
2852 bool saved_can_use_partial_vectors_p
2853 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2855 /* We don't expect to have to roll back to anything other than an empty
2856 set of rgroups. */
2857 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2859 /* This is the point where we can re-start analysis with SLP forced off. */
2860 start_over:
2862 /* Apply the suggested unrolling factor; this was determined by the backend
2863 during finish_cost the first time we ran the analysis for this
2864 vector mode. */
2865 if (applying_suggested_uf)
2866 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2868 /* Now the vectorization factor is final. */
2869 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2870 gcc_assert (known_ne (vectorization_factor, 0U));
2872 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2874 dump_printf_loc (MSG_NOTE, vect_location,
2875 "vectorization_factor = ");
2876 dump_dec (MSG_NOTE, vectorization_factor);
2877 dump_printf (MSG_NOTE, ", niters = %wd\n",
2878 LOOP_VINFO_INT_NITERS (loop_vinfo));
2881 if (max_vf != MAX_VECTORIZATION_FACTOR
2882 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2883 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2885 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2887 /* Analyze the alignment of the data-refs in the loop.
2888 Fail if a data reference is found that cannot be vectorized. */
2890 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2891 if (!ok)
2893 if (dump_enabled_p ())
2894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2895 "bad data alignment.\n");
2896 return ok;
2899 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2900 It is important to call pruning after vect_analyze_data_ref_accesses,
2901 since we use grouping information gathered by interleaving analysis. */
2902 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2903 if (!ok)
2904 return ok;
2906 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2907 vectorization, since we do not want to add extra peeling or
2908 add versioning for alignment. */
2909 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2910 /* This pass will decide on using loop versioning and/or loop peeling in
2911 order to enhance the alignment of data references in the loop. */
2912 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2913 if (!ok)
2914 return ok;
2916 if (slp)
2918 /* Analyze operations in the SLP instances. Note this may
2919 remove unsupported SLP instances which makes the above
2920 SLP kind detection invalid. */
2921 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2922 vect_slp_analyze_operations (loop_vinfo);
2923 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2925 ok = opt_result::failure_at (vect_location,
2926 "unsupported SLP instances\n");
2927 goto again;
2930 /* Check whether any load in ALL SLP instances is possibly permuted. */
2931 slp_tree load_node, slp_root;
2932 unsigned i, x;
2933 slp_instance instance;
2934 bool can_use_lanes = true;
2935 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2937 slp_root = SLP_INSTANCE_TREE (instance);
2938 int group_size = SLP_TREE_LANES (slp_root);
2939 tree vectype = SLP_TREE_VECTYPE (slp_root);
2940 bool loads_permuted = false;
2941 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2943 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2944 continue;
2945 unsigned j;
2946 stmt_vec_info load_info;
2947 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2948 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2950 loads_permuted = true;
2951 break;
2955 /* If the loads and stores can be handled with load/store-lane
2956 instructions record it and move on to the next instance. */
2957 if (loads_permuted
2958 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2959 && vect_store_lanes_supported (vectype, group_size, false)
2960 != IFN_LAST)
2962 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2963 if (STMT_VINFO_GROUPED_ACCESS
2964 (SLP_TREE_REPRESENTATIVE (load_node)))
2966 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2967 (SLP_TREE_REPRESENTATIVE (load_node));
2968 /* Use SLP for strided accesses (or if we can't
2969 load-lanes). */
2970 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2971 || vect_load_lanes_supported
2972 (STMT_VINFO_VECTYPE (stmt_vinfo),
2973 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2974 break;
2977 can_use_lanes
2978 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2980 if (can_use_lanes && dump_enabled_p ())
2981 dump_printf_loc (MSG_NOTE, vect_location,
2982 "SLP instance %p can use load/store-lanes\n",
2983 (void *) instance);
2985 else
2987 can_use_lanes = false;
2988 break;
2992 /* If all SLP instances can use load/store-lanes abort SLP and try again
2993 with SLP disabled. */
2994 if (can_use_lanes)
2996 ok = opt_result::failure_at (vect_location,
2997 "Built SLP cancelled: can use "
2998 "load/store-lanes\n");
2999 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3001 "Built SLP cancelled: all SLP instances support "
3002 "load/store-lanes\n");
3003 goto again;
3007 /* Dissolve SLP-only groups. */
3008 vect_dissolve_slp_only_groups (loop_vinfo);
3010 /* Scan all the remaining operations in the loop that are not subject
3011 to SLP and make sure they are vectorizable. */
3012 ok = vect_analyze_loop_operations (loop_vinfo);
3013 if (!ok)
3015 if (dump_enabled_p ())
3016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3017 "bad operation or unsupported loop bound.\n");
3018 return ok;
3021 /* For now, we don't expect to mix both masking and length approaches for one
3022 loop; disable partial vectors if both are recorded. */
3023 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3024 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3025 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3027 if (dump_enabled_p ())
3028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3029 "can't vectorize a loop with partial vectors"
3030 " because we don't expect to mix different"
3031 " approaches with partial vectors for the"
3032 " same loop.\n");
3033 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3036 /* If we still have the option of using partial vectors,
3037 check whether we can generate the necessary loop controls. */
3038 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3040 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3042 if (!vect_verify_full_masking (loop_vinfo)
3043 && !vect_verify_full_masking_avx512 (loop_vinfo))
3044 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3046 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3047 if (!vect_verify_loop_lens (loop_vinfo))
3048 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3051 /* If we're vectorizing a loop that uses length "controls" and
3052 can iterate more than once, we apply the decrementing IV approach
3053 to the loop control. */
3054 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3055 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3056 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3057 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3058 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3059 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3060 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3062 /* If a loop uses length controls and has a decrementing loop control IV,
3063 we will normally pass that IV through a MIN_EXPR to calculate the
3064 basis for the length controls. E.g. in a loop that processes one
3065 element per scalar iteration, the number of elements would be
3066 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3068 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3069 step, since only the final iteration of the vector loop can have
3070 inactive lanes.
3072 However, some targets have a dedicated instruction for calculating the
3073 preferred length, given the total number of elements that still need to
3074 be processed. This is encapsulated in the SELECT_VL internal function.
3076 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3077 to determine the basis for the length controls. However, unlike the
3078 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3079 lanes inactive in any iteration of the vector loop, not just the last
3080 iteration. This SELECT_VL approach therefore requires us to use pointer
3081 IVs with variable steps.
3083 Once we've decided how many elements should be processed by one
3084 iteration of the vector loop, we need to populate the rgroup controls.
3085 If a loop has multiple rgroups, we need to make sure that those rgroups
3086 "line up" (that is, they must be consistent about which elements are
3087 active and which aren't). This is done by vect_adjust_loop_lens_control.
3089 In principle, it would be possible to use vect_adjust_loop_lens_control
3090 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3091 However:
3093 (1) In practice, it only makes sense to use SELECT_VL when a vector
3094 operation will be controlled directly by the result. It is not
3095 worth using SELECT_VL if it would only be the input to other
3096 calculations.
3098 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3099 pointer IV will need N updates by a variable amount (N-1 updates
3100 within the iteration and 1 update to move to the next iteration).
3102 Because of this, we prefer to use the MIN_EXPR approach whenever there
3103 is more than one length control.
3105 In addition, SELECT_VL always operates to a granularity of 1 unit.
3106 If we wanted to use it to control an SLP operation on N consecutive
3107 elements, we would need to make the SELECT_VL inputs measure scalar
3108 iterations (rather than elements) and then multiply the SELECT_VL
3109 result by N. But using SELECT_VL this way is inefficient because
3110 of (1) above.
3112 Finally, we don't apply SELECT_VL on a single-rgroup loop when both
3113 (1) and (2) are satisfied:
3115 (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3116 (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3118 Since SELECT_VL (a variable step) would make SCEV analysis fail and we
3119 would then lose the benefit of subsequent unrolling optimizations, we prefer
3120 using the MIN_EXPR approach in this situation. */
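/* An illustrative sketch (not the exact IR that is generated): with
   VF = 4 and N elements left to process, the MIN_EXPR approach computes

     len = MIN_EXPR <N, 4>;

   and steps the pointer IVs by the invariant amount of 4 elements,
   while the SELECT_VL approach computes

     len = .SELECT_VL (N, 4);

   which may pick fewer than 4 active lanes even before the final
   iteration, so the pointer IVs must be stepped by the variable LEN.  */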
3121 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3123 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3124 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3125 OPTIMIZE_FOR_SPEED)
3126 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3127 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3128 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3129 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3130 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3133 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3134 assuming that the loop will be used as a main loop. We will redo
3135 this analysis later if we instead decide to use the loop as an
3136 epilogue loop. */
3137 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3138 if (!ok)
3139 return ok;
3141 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3142 to be able to handle fewer than VF scalars, or needs to have a lower VF
3143 than the main loop. */
3144 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3145 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3147 poly_uint64 unscaled_vf
3148 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3149 orig_loop_vinfo->suggested_unroll_factor);
3150 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3151 return opt_result::failure_at (vect_location,
3152 "Vectorization factor too high for"
3153 " epilogue loop.\n");
3156 /* Check the costings of the loop make vectorizing worthwhile. */
3157 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3158 if (res < 0)
3160 ok = opt_result::failure_at (vect_location,
3161 "Loop costings may not be worthwhile.\n");
3162 goto again;
3164 if (!res)
3165 return opt_result::failure_at (vect_location,
3166 "Loop costings not worthwhile.\n");
3168 /* If an epilogue loop is required make sure we can create one. */
3169 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3170 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3172 if (dump_enabled_p ())
3173 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3174 if (!vect_can_advance_ivs_p (loop_vinfo)
3175 || !slpeel_can_duplicate_loop_p (loop,
3176 LOOP_VINFO_IV_EXIT (loop_vinfo),
3177 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3179 ok = opt_result::failure_at (vect_location,
3180 "not vectorized: can't create required "
3181 "epilog loop\n");
3182 goto again;
3186 /* During peeling, we need to check if the number of loop iterations is
3187 enough for both the peeled prolog loop and the vector loop. This check
3188 can be merged with the threshold check of loop versioning, so
3189 increase the threshold for this case if necessary.
3191 If we are analyzing an epilogue we still want to check what its
3192 versioning threshold would be. If we decide to vectorize the epilogues we
3193 will want to use the lowest versioning threshold of all epilogues and main
3194 loop. This will enable us to enter a vectorized epilogue even when
3195 versioning the loop. We can't simply check whether the epilogue requires
3196 versioning though since we may have skipped some versioning checks when
3197 analyzing the epilogue. For instance, checks for alias versioning will be
3198 skipped when dealing with epilogues as we assume we already checked them
3199 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
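/* A hedged numeric example: with a prologue peel of 3 iterations, a VF
   of 8 (and no partial vectors) and peeling for gaps, the code below
   computes niters_th = 3 + 8 + 1 = 12, and then raises it to the
   runtime profitability threshold TH if that is larger.  */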
3200 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3202 poly_uint64 niters_th = 0;
3203 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3205 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3207 /* Niters for peeled prolog loop. */
3208 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3210 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3211 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3212 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3214 else
3215 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3218 /* Niters for at least one iteration of vectorized loop. */
3219 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3220 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3221 /* One additional iteration because of peeling for gap. */
3222 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3223 niters_th += 1;
3225 /* Use the same condition as vect_transform_loop to decide when to use
3226 the cost to determine a versioning threshold. */
3227 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3228 && ordered_p (th, niters_th))
3229 niters_th = ordered_max (poly_uint64 (th), niters_th);
3231 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3234 gcc_assert (known_eq (vectorization_factor,
3235 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3237 slp_done_for_suggested_uf = slp;
3239 /* Ok to vectorize! */
3240 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3241 return opt_result::success ();
3243 again:
3244 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3245 gcc_assert (!ok);
3247 /* Try again with SLP forced off, but if we didn't do any SLP there is
3248 no point in re-trying. */
3249 if (!slp)
3250 return ok;
3252 /* If the SLP decision was true when the suggested unroll factor was worked
3253 out, and we are now applying that suggested unroll factor, we don't need to
3254 re-try any more. */
3255 if (applying_suggested_uf && slp_done_for_suggested_uf)
3256 return ok;
3258 /* If there are reduction chains re-trying will fail anyway. */
3259 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3260 return ok;
3262 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3263 via interleaving or lane instructions. */
3264 slp_instance instance;
3265 slp_tree node;
3266 unsigned i, j;
3267 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3269 stmt_vec_info vinfo;
3270 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3271 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3272 continue;
3273 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3274 unsigned int size = DR_GROUP_SIZE (vinfo);
3275 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3276 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3277 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3278 && ! vect_grouped_store_supported (vectype, size))
3279 return opt_result::failure_at (vinfo->stmt,
3280 "unsupported grouped store\n");
3281 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3283 vinfo = SLP_TREE_REPRESENTATIVE (node);
3284 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3286 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3287 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3288 size = DR_GROUP_SIZE (vinfo);
3289 vectype = STMT_VINFO_VECTYPE (vinfo);
3290 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3291 && ! vect_grouped_load_supported (vectype, single_element_p,
3292 size))
3293 return opt_result::failure_at (vinfo->stmt,
3294 "unsupported grouped load\n");
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_NOTE, vect_location,
3301 "re-trying with SLP disabled\n");
3303 /* Roll back state appropriately. No SLP this time. */
3304 slp = false;
3305 /* Restore the vectorization factor as it was without SLP. */
3306 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3307 /* Free the SLP instances. */
3308 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3309 vect_free_slp_instance (instance);
3310 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3311 /* Reset SLP type to loop_vect on all stmts. */
3312 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3314 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3315 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3316 !gsi_end_p (si); gsi_next (&si))
3318 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3319 STMT_SLP_TYPE (stmt_info) = loop_vect;
3320 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3321 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3323 /* vectorizable_reduction adjusts reduction stmt def-types;
3324 restore them to that of the PHI. */
3325 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3326 = STMT_VINFO_DEF_TYPE (stmt_info);
3327 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3328 (STMT_VINFO_REDUC_DEF (stmt_info)))
3329 = STMT_VINFO_DEF_TYPE (stmt_info);
3332 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3333 !gsi_end_p (si); gsi_next (&si))
3335 if (is_gimple_debug (gsi_stmt (si)))
3336 continue;
3337 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3338 STMT_SLP_TYPE (stmt_info) = loop_vect;
3339 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3341 stmt_vec_info pattern_stmt_info
3342 = STMT_VINFO_RELATED_STMT (stmt_info);
3343 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3344 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3346 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3347 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3348 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3349 !gsi_end_p (pi); gsi_next (&pi))
3350 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3351 = loop_vect;
3355 /* Free optimized alias test DDRS. */
3356 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3357 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3358 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3359 /* Reset target cost data. */
3360 delete loop_vinfo->vector_costs;
3361 loop_vinfo->vector_costs = nullptr;
3362 /* Reset accumulated rgroup information. */
3363 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3364 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3365 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3366 /* Reset assorted flags. */
3367 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3368 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3369 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3370 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3371 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3372 = saved_can_use_partial_vectors_p;
3374 goto start_over;
3377 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3378 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3379 OLD_LOOP_VINFO is better unless something specifically indicates
3380 otherwise.
3382 Note that this deliberately isn't a partial order. */
3384 static bool
3385 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3386 loop_vec_info old_loop_vinfo)
3388 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3389 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3391 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3392 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3394 /* Always prefer a VF of loop->simdlen over any other VF. */
3395 if (loop->simdlen)
3397 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3398 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3399 if (new_simdlen_p != old_simdlen_p)
3400 return new_simdlen_p;
3403 const auto *old_costs = old_loop_vinfo->vector_costs;
3404 const auto *new_costs = new_loop_vinfo->vector_costs;
3405 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3406 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3408 return new_costs->better_main_loop_than_p (old_costs);
3411 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3412 true if we should. */
3414 static bool
3415 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3416 loop_vec_info old_loop_vinfo)
3418 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3419 return false;
3421 if (dump_enabled_p ())
3422 dump_printf_loc (MSG_NOTE, vect_location,
3423 "***** Preferring vector mode %s to vector mode %s\n",
3424 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3425 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3426 return true;
3429 /* Analyze LOOP with VECTOR_MODES[MODE_I], and as an epilogue loop if
3430 MAIN_LOOP_VINFO is not NULL. Set AUTODETECTED_VECTOR_MODE when the mode
3431 analyzed is VOIDmode and advance MODE_I to the next mode useful to analyze.
3432 Return the loop_vinfo on success and wrapped null on failure. */
3434 static opt_loop_vec_info
3435 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3436 const vect_loop_form_info *loop_form_info,
3437 loop_vec_info main_loop_vinfo,
3438 const vector_modes &vector_modes, unsigned &mode_i,
3439 machine_mode &autodetected_vector_mode,
3440 bool &fatal)
3442 loop_vec_info loop_vinfo
3443 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3445 machine_mode vector_mode = vector_modes[mode_i];
3446 loop_vinfo->vector_mode = vector_mode;
3447 unsigned int suggested_unroll_factor = 1;
3448 bool slp_done_for_suggested_uf = false;
3450 /* Run the main analysis. */
3451 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3452 &suggested_unroll_factor,
3453 slp_done_for_suggested_uf);
3454 if (dump_enabled_p ())
3455 dump_printf_loc (MSG_NOTE, vect_location,
3456 "***** Analysis %s with vector mode %s\n",
3457 res ? "succeeded" : "failed",
3458 GET_MODE_NAME (loop_vinfo->vector_mode));
3460 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3462 if (dump_enabled_p ())
3463 dump_printf_loc (MSG_NOTE, vect_location,
3464 "***** Re-trying analysis for unrolling"
3465 " with unroll factor %d and slp %s.\n",
3466 suggested_unroll_factor,
3467 slp_done_for_suggested_uf ? "on" : "off");
3468 loop_vec_info unroll_vinfo
3469 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3470 unroll_vinfo->vector_mode = vector_mode;
3471 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3472 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3473 slp_done_for_suggested_uf);
3474 if (new_res)
3476 delete loop_vinfo;
3477 loop_vinfo = unroll_vinfo;
3479 else
3480 delete unroll_vinfo;
3483 /* Remember the autodetected vector mode. */
3484 if (vector_mode == VOIDmode)
3485 autodetected_vector_mode = loop_vinfo->vector_mode;
3487 /* Advance mode_i, first skipping modes that would result in the
3488 same analysis result. */
3489 while (mode_i + 1 < vector_modes.length ()
3490 && vect_chooses_same_modes_p (loop_vinfo,
3491 vector_modes[mode_i + 1]))
3493 if (dump_enabled_p ())
3494 dump_printf_loc (MSG_NOTE, vect_location,
3495 "***** The result for vector mode %s would"
3496 " be the same\n",
3497 GET_MODE_NAME (vector_modes[mode_i + 1]));
3498 mode_i += 1;
3500 if (mode_i + 1 < vector_modes.length ()
3501 && VECTOR_MODE_P (autodetected_vector_mode)
3502 && (related_vector_mode (vector_modes[mode_i + 1],
3503 GET_MODE_INNER (autodetected_vector_mode))
3504 == autodetected_vector_mode)
3505 && (related_vector_mode (autodetected_vector_mode,
3506 GET_MODE_INNER (vector_modes[mode_i + 1]))
3507 == vector_modes[mode_i + 1]))
3509 if (dump_enabled_p ())
3510 dump_printf_loc (MSG_NOTE, vect_location,
3511 "***** Skipping vector mode %s, which would"
3512 " repeat the analysis for %s\n",
3513 GET_MODE_NAME (vector_modes[mode_i + 1]),
3514 GET_MODE_NAME (autodetected_vector_mode));
3515 mode_i += 1;
3517 mode_i++;
3519 if (!res)
3521 delete loop_vinfo;
3522 if (fatal)
3523 gcc_checking_assert (main_loop_vinfo == NULL);
3524 return opt_loop_vec_info::propagate_failure (res);
3527 return opt_loop_vec_info::success (loop_vinfo);
3530 /* Function vect_analyze_loop.
3532 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3533 for it. The different analyses will record information in the
3534 loop_vec_info struct. */
3535 opt_loop_vec_info
3536 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3538 DUMP_VECT_SCOPE ("analyze_loop_nest");
3540 if (loop_outer (loop)
3541 && loop_vec_info_for_loop (loop_outer (loop))
3542 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3543 return opt_loop_vec_info::failure_at (vect_location,
3544 "outer-loop already vectorized.\n");
3546 if (!find_loop_nest (loop, &shared->loop_nest))
3547 return opt_loop_vec_info::failure_at
3548 (vect_location,
3549 "not vectorized: loop nest containing two or more consecutive inner"
3550 " loops cannot be vectorized\n");
3552 /* Analyze the loop form. */
3553 vect_loop_form_info loop_form_info;
3554 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3555 if (!res)
3557 if (dump_enabled_p ())
3558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3559 "bad loop form.\n");
3560 return opt_loop_vec_info::propagate_failure (res);
3562 if (!integer_onep (loop_form_info.assumptions))
3564 /* We consider vectorizing this loop by versioning it under
3565 some assumptions. In order to do this, we need to clear
3566 existing information computed by the scev and niter analyzers. */
3567 scev_reset_htab ();
3568 free_numbers_of_iterations_estimates (loop);
3569 /* Also set a flag for this loop so that the following scev and niter
3570 analyses are done under the assumptions. */
3571 loop_constraint_set (loop, LOOP_C_FINITE);
3573 else
3574 /* Clear the existing niter information to make sure the nonwrapping flag
3575 will be calculated and set appropriately. */
3576 free_numbers_of_iterations_estimates (loop);
3578 auto_vector_modes vector_modes;
3579 /* Autodetect first vector size we try. */
3580 vector_modes.safe_push (VOIDmode);
3581 unsigned int autovec_flags
3582 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3583 loop->simdlen != 0);
3584 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3585 && !unlimited_cost_model (loop));
3586 machine_mode autodetected_vector_mode = VOIDmode;
3587 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3588 unsigned int mode_i = 0;
3589 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3591 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3592 a mode has not been analyzed. */
3593 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3594 for (unsigned i = 0; i < vector_modes.length (); ++i)
3595 cached_vf_per_mode.safe_push (0);
3597 /* First determine the main loop vectorization mode, either the first
3598 one that works, starting with auto-detecting the vector mode and then
3599 following the targets order of preference, or the one with the
3600 lowest cost if pick_lowest_cost_p. */
3601 while (1)
3603 bool fatal;
3604 unsigned int last_mode_i = mode_i;
3605 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3606 failed. */
3607 cached_vf_per_mode[last_mode_i] = -1;
3608 opt_loop_vec_info loop_vinfo
3609 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3610 NULL, vector_modes, mode_i,
3611 autodetected_vector_mode, fatal);
3612 if (fatal)
3613 break;
3615 if (loop_vinfo)
3617 /* Analysis has been successful so update the VF value. The
3618 VF should always be a multiple of unroll_factor and we want to
3619 capture the original VF here. */
3620 cached_vf_per_mode[last_mode_i]
3621 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3622 loop_vinfo->suggested_unroll_factor);
3623 /* Once we hit the desired simdlen for the first time,
3624 discard any previous attempts. */
3625 if (simdlen
3626 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3628 delete first_loop_vinfo;
3629 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3630 simdlen = 0;
3632 else if (pick_lowest_cost_p
3633 && first_loop_vinfo
3634 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3636 /* Pick loop_vinfo over first_loop_vinfo. */
3637 delete first_loop_vinfo;
3638 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3640 if (first_loop_vinfo == NULL)
3641 first_loop_vinfo = loop_vinfo;
3642 else
3644 delete loop_vinfo;
3645 loop_vinfo = opt_loop_vec_info::success (NULL);
3648 /* Commit to first_loop_vinfo if we have no reason to try
3649 alternatives. */
3650 if (!simdlen && !pick_lowest_cost_p)
3651 break;
3653 if (mode_i == vector_modes.length ()
3654 || autodetected_vector_mode == VOIDmode)
3655 break;
3657 /* Try the next biggest vector size. */
3658 if (dump_enabled_p ())
3659 dump_printf_loc (MSG_NOTE, vect_location,
3660 "***** Re-trying analysis with vector mode %s\n",
3661 GET_MODE_NAME (vector_modes[mode_i]));
3663 if (!first_loop_vinfo)
3664 return opt_loop_vec_info::propagate_failure (res);
3666 if (dump_enabled_p ())
3667 dump_printf_loc (MSG_NOTE, vect_location,
3668 "***** Choosing vector mode %s\n",
3669 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3671 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3672 enabled, SIMDUID is not set, it is the innermost loop and we have
3673 either already found the loop's SIMDLEN or there was no SIMDLEN to
3674 begin with.
3675 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3676 bool vect_epilogues = (!simdlen
3677 && loop->inner == NULL
3678 && param_vect_epilogues_nomask
3679 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3680 && !loop->simduid);
3681 if (!vect_epilogues)
3682 return first_loop_vinfo;
3684 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3685 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3687 /* For epilogues start the analysis from the first mode. The motivation
3688 behind starting from the beginning comes from cases where the VECTOR_MODES
3689 array may contain length-agnostic and length-specific modes. Their
3690 ordering is not guaranteed, so we could end up picking a mode for the main
3691 loop that is after the epilogue's optimal mode. */
3692 vector_modes[0] = autodetected_vector_mode;
3693 mode_i = 0;
3695 bool supports_partial_vectors =
3696 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3697 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3699 while (1)
3701 /* If the target does not support partial vectors we can shorten the
3702 number of modes to analyze for the epilogue as we know we can't pick a
3703 mode that would lead to a VF at least as big as the
3704 FIRST_VINFO_VF. */
3705 if (!supports_partial_vectors
3706 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3708 mode_i++;
3709 if (mode_i == vector_modes.length ())
3710 break;
3711 continue;
3714 if (dump_enabled_p ())
3715 dump_printf_loc (MSG_NOTE, vect_location,
3716 "***** Re-trying epilogue analysis with vector "
3717 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3719 bool fatal;
3720 opt_loop_vec_info loop_vinfo
3721 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3722 first_loop_vinfo,
3723 vector_modes, mode_i,
3724 autodetected_vector_mode, fatal);
3725 if (fatal)
3726 break;
3728 if (loop_vinfo)
3730 if (pick_lowest_cost_p)
3732 /* Keep trying to roll back vectorization attempts while the
3733 loop_vec_infos they produced were worse than this one. */
3734 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3735 while (!vinfos.is_empty ()
3736 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3738 gcc_assert (vect_epilogues);
3739 delete vinfos.pop ();
3742 /* For now only allow one epilogue loop. */
3743 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3745 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3746 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3747 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3748 || maybe_ne (lowest_th, 0U));
3749 /* Keep track of the known smallest versioning
3750 threshold. */
3751 if (ordered_p (lowest_th, th))
3752 lowest_th = ordered_min (lowest_th, th);
3754 else
3756 delete loop_vinfo;
3757 loop_vinfo = opt_loop_vec_info::success (NULL);
3760 /* For now only allow one epilogue loop, but allow
3761 pick_lowest_cost_p to replace it, so commit to the
3762 first epilogue if we have no reason to try alternatives. */
3763 if (!pick_lowest_cost_p)
3764 break;
3767 if (mode_i == vector_modes.length ())
3768 break;
3772 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3774 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3775 if (dump_enabled_p ())
3776 dump_printf_loc (MSG_NOTE, vect_location,
3777 "***** Choosing epilogue vector mode %s\n",
3778 GET_MODE_NAME
3779 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3782 return first_loop_vinfo;
3785 /* Return true if there is an in-order reduction function for CODE, storing
3786 it in *REDUC_FN if so. */
3788 static bool
3789 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3791 /* We support MINUS_EXPR by negating the operand. This also preserves an
3792 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3793 (-0.0) = -0.0. */
3794 if (code == PLUS_EXPR || code == MINUS_EXPR)
3796 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3797 return true;
3799 return false;
3802 /* Function reduction_fn_for_scalar_code
3804 Input:
3805 CODE - tree_code of a reduction operation.
3807 Output:
3808 REDUC_FN - the corresponding internal function to be used to reduce the
3809 vector of partial results into a single scalar result, or IFN_LAST
3810 if the operation is a supported reduction operation, but does not have
3811 such an internal function.
3813 Return FALSE if CODE currently cannot be vectorized as a reduction. */
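/* As a quick illustration of the mapping implemented below: for
   CODE == PLUS_EXPR this sets *REDUC_FN to IFN_REDUC_PLUS, which sums
   all lanes of the vector of partial results (e.g. {1, 2, 3, 4} -> 10),
   whereas MULT_EXPR is accepted as a reduction but has no such internal
   function, so *REDUC_FN is set to IFN_LAST for it.  */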
3815 bool
3816 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3818 if (code.is_tree_code ())
3819 switch (tree_code (code))
3821 case MAX_EXPR:
3822 *reduc_fn = IFN_REDUC_MAX;
3823 return true;
3825 case MIN_EXPR:
3826 *reduc_fn = IFN_REDUC_MIN;
3827 return true;
3829 case PLUS_EXPR:
3830 *reduc_fn = IFN_REDUC_PLUS;
3831 return true;
3833 case BIT_AND_EXPR:
3834 *reduc_fn = IFN_REDUC_AND;
3835 return true;
3837 case BIT_IOR_EXPR:
3838 *reduc_fn = IFN_REDUC_IOR;
3839 return true;
3841 case BIT_XOR_EXPR:
3842 *reduc_fn = IFN_REDUC_XOR;
3843 return true;
3845 case MULT_EXPR:
3846 case MINUS_EXPR:
3847 *reduc_fn = IFN_LAST;
3848 return true;
3850 default:
3851 return false;
3853 else
3854 switch (combined_fn (code))
3856 CASE_CFN_FMAX:
3857 *reduc_fn = IFN_REDUC_FMAX;
3858 return true;
3860 CASE_CFN_FMIN:
3861 *reduc_fn = IFN_REDUC_FMIN;
3862 return true;
3864 default:
3865 return false;
3869 /* If there is a neutral value X such that a reduction would not be affected
3870 by the introduction of additional X elements, return that X, otherwise
3871 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3872 of the scalar elements. If the reduction has just a single initial value
3873 then INITIAL_VALUE is that value, otherwise it is null.
3874 If AS_INITIAL is TRUE the value is supposed to be used as the initial value.
3875 In that case no signed zero is returned. */
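/* For illustration: for a PLUS_EXPR reduction over a float type that
   honors signed zeros, the neutral element used mid-chain is -0.0,
   because x + (-0.0) == x for every x including x == -0.0, whereas
   padding with +0.0 would turn a -0.0 result into +0.0.  As an initial
   value (AS_INITIAL) plain 0.0 is used instead.  */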
3877 tree
3878 neutral_op_for_reduction (tree scalar_type, code_helper code,
3879 tree initial_value, bool as_initial)
3881 if (code.is_tree_code ())
3882 switch (tree_code (code))
3884 case DOT_PROD_EXPR:
3885 case SAD_EXPR:
3886 case MINUS_EXPR:
3887 case BIT_IOR_EXPR:
3888 case BIT_XOR_EXPR:
3889 return build_zero_cst (scalar_type);
3890 case WIDEN_SUM_EXPR:
3891 case PLUS_EXPR:
3892 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3893 return build_real (scalar_type, dconstm0);
3894 else
3895 return build_zero_cst (scalar_type);
3897 case MULT_EXPR:
3898 return build_one_cst (scalar_type);
3900 case BIT_AND_EXPR:
3901 return build_all_ones_cst (scalar_type);
3903 case MAX_EXPR:
3904 case MIN_EXPR:
3905 return initial_value;
3907 default:
3908 return NULL_TREE;
3910 else
3911 switch (combined_fn (code))
3913 CASE_CFN_FMIN:
3914 CASE_CFN_FMAX:
3915 return initial_value;
3917 default:
3918 return NULL_TREE;
3922 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3923 STMT is printed with a message MSG. */
3925 static void
3926 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3928 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3931 /* Return true if we need an in-order reduction for operation CODE
3932 on type TYPE. */
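/* A concrete reminder of why this matters for floating point: in IEEE
   double arithmetic (0.1 + 0.2) + 0.3 evaluates to 0.6000000000000001
   while 0.1 + (0.2 + 0.3) evaluates to 0.6, so without
   -fassociative-math an FP add reduction has to be kept in order.  */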
3935 bool
3936 needs_fold_left_reduction_p (tree type, code_helper code)
3938 /* CHECKME: check for !flag_finite_math_only too? */
3939 if (SCALAR_FLOAT_TYPE_P (type))
3941 if (code.is_tree_code ())
3942 switch (tree_code (code))
3944 case MIN_EXPR:
3945 case MAX_EXPR:
3946 return false;
3948 default:
3949 return !flag_associative_math;
3951 else
3952 switch (combined_fn (code))
3954 CASE_CFN_FMIN:
3955 CASE_CFN_FMAX:
3956 return false;
3958 default:
3959 return !flag_associative_math;
3963 if (INTEGRAL_TYPE_P (type))
3964 return (!code.is_tree_code ()
3965 || !operation_no_trapping_overflow (type, tree_code (code)));
3967 if (SAT_FIXED_POINT_TYPE_P (type))
3968 return true;
3970 return false;
3973 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3974 has a handled computation expression. Store the main reduction
3975 operation in *CODE. */
3977 static bool
3978 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3979 tree loop_arg, code_helper *code,
3980 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3982 auto_bitmap visited;
3983 tree lookfor = PHI_RESULT (phi);
3984 ssa_op_iter curri;
3985 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3986 while (USE_FROM_PTR (curr) != loop_arg)
3987 curr = op_iter_next_use (&curri);
3988 curri.i = curri.numops;
3991 path.safe_push (std::make_pair (curri, curr));
3992 tree use = USE_FROM_PTR (curr);
3993 if (use == lookfor)
3994 break;
3995 gimple *def = SSA_NAME_DEF_STMT (use);
3996 if (gimple_nop_p (def)
3997 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3999 pop:
4002 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4003 curri = x.first;
4004 curr = x.second;
4006 curr = op_iter_next_use (&curri);
4007 /* Skip already visited or non-SSA operands (from iterating
4008 over PHI args). */
4009 while (curr != NULL_USE_OPERAND_P
4010 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4011 || ! bitmap_set_bit (visited,
4012 SSA_NAME_VERSION
4013 (USE_FROM_PTR (curr)))));
4015 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4016 if (curr == NULL_USE_OPERAND_P)
4017 break;
4019 else
4021 if (gimple_code (def) == GIMPLE_PHI)
4022 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4023 else
4024 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4025 while (curr != NULL_USE_OPERAND_P
4026 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4027 || ! bitmap_set_bit (visited,
4028 SSA_NAME_VERSION
4029 (USE_FROM_PTR (curr)))))
4030 curr = op_iter_next_use (&curri);
4031 if (curr == NULL_USE_OPERAND_P)
4032 goto pop;
4035 while (1);
4036 if (dump_file && (dump_flags & TDF_DETAILS))
4038 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4039 unsigned i;
4040 std::pair<ssa_op_iter, use_operand_p> *x;
4041 FOR_EACH_VEC_ELT (path, i, x)
4042 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4043 dump_printf (MSG_NOTE, "\n");
4046 /* Check whether the reduction path detected is valid. */
4047 bool fail = path.length () == 0;
4048 bool neg = false;
4049 int sign = -1;
4050 *code = ERROR_MARK;
4051 for (unsigned i = 1; i < path.length (); ++i)
4053 gimple *use_stmt = USE_STMT (path[i].second);
4054 gimple_match_op op;
4055 if (!gimple_extract_op (use_stmt, &op))
4057 fail = true;
4058 break;
4060 unsigned int opi = op.num_ops;
4061 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4063 /* The following makes sure we can compute the operand index
4064 easily, plus it mostly disallows chaining via COND_EXPR condition
4065 operands. */
4066 for (opi = 0; opi < op.num_ops; ++opi)
4067 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4068 break;
4070 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4072 for (opi = 0; opi < op.num_ops; ++opi)
4073 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4074 break;
4076 if (opi == op.num_ops)
4078 fail = true;
4079 break;
4081 op.code = canonicalize_code (op.code, op.type);
4082 if (op.code == MINUS_EXPR)
4084 op.code = PLUS_EXPR;
4085 /* Track whether we negate the reduction value each iteration. */
4086 if (op.ops[1] == op.ops[opi])
4087 neg = ! neg;
4089 if (CONVERT_EXPR_CODE_P (op.code)
4090 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4092 else if (*code == ERROR_MARK)
4094 *code = op.code;
4095 sign = TYPE_SIGN (op.type);
4097 else if (op.code != *code)
4099 fail = true;
4100 break;
4102 else if ((op.code == MIN_EXPR
4103 || op.code == MAX_EXPR)
4104 && sign != TYPE_SIGN (op.type))
4106 fail = true;
4107 break;
4109 /* Check there's only a single stmt the op is used on. For the
4110 non-value-changing tail and the last stmt, allow out-of-loop uses.
4111 ??? We could relax this and handle arbitrary live stmts by
4112 forcing a scalar epilogue for example. */
4113 imm_use_iterator imm_iter;
4114 use_operand_p use_p;
4115 gimple *op_use_stmt;
4116 unsigned cnt = 0;
4117 bool cond_fn_p = op.code.is_internal_fn ()
4118 && (conditional_internal_fn_code (internal_fn (op.code))
4119 != ERROR_MARK);
4121 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4123 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4124 op1 twice (once as definition, once as else) in the same operation.
4125 Allow this. */
4126 if (cond_fn_p && op_use_stmt == use_stmt)
4128 gcall *call = as_a<gcall *> (use_stmt);
4129 unsigned else_pos
4130 = internal_fn_else_index (internal_fn (op.code));
4132 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4134 if (j == else_pos)
4135 continue;
4136 if (gimple_call_arg (call, j) == op.ops[opi])
4137 cnt++;
4140 else if (!is_gimple_debug (op_use_stmt)
4141 && (*code != ERROR_MARK
4142 || flow_bb_inside_loop_p (loop,
4143 gimple_bb (op_use_stmt))))
4144 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4145 cnt++;
4148 if (cnt != 1)
4150 fail = true;
4151 break;
4154 return ! fail && ! neg && *code != ERROR_MARK;
4157 bool
4158 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4159 tree loop_arg, enum tree_code code)
4161 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4162 code_helper code_;
4163 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4164 && code_ == code);
4169 /* Function vect_is_simple_reduction
4171 (1) Detect a cross-iteration def-use cycle that represents a simple
4172 reduction computation. We look for the following pattern:
4174 loop_header:
4175 a1 = phi < a0, a2 >
4176 a3 = ...
4177 a2 = operation (a3, a1)
4181 a3 = ...
4182 loop_header:
4183 a1 = phi < a0, a2 >
4184 a2 = operation (a3, a1)
4186 such that:
4187 1. operation is commutative and associative and it is safe to
4188 change the order of the computation
4189 2. no uses for a2 in the loop (a2 is used out of the loop)
4190 3. no uses of a1 in the loop besides the reduction operation
4191 4. no uses of a1 outside the loop.
4193 Conditions 1,4 are tested here.
4194 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4196 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4197 nested cycles.
4199 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4200 reductions:
4202 a1 = phi < a0, a2 >
4203 inner loop (def of a3)
4204 a2 = phi < a3 >
4206 (4) Detect condition expressions, i.e.:
4207 for (int i = 0; i < N; i++)
4208 if (a[i] < val)
4209 ret_val = a[i];
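     As a minimal concrete instance of pattern (1):

       for (i = 0; i < N; i++)
         sum += a[i];

     here a1 is the loop PHI of sum, the operation is PLUS_EXPR, a3 is
     the load of a[i] and a2 = a1 + a3 feeds back into the PHI.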
4213 static stmt_vec_info
4214 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4215 bool *double_reduc, bool *reduc_chain_p, bool slp)
4217 gphi *phi = as_a <gphi *> (phi_info->stmt);
4218 gimple *phi_use_stmt = NULL;
4219 imm_use_iterator imm_iter;
4220 use_operand_p use_p;
4222 *double_reduc = false;
4223 *reduc_chain_p = false;
4224 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4226 tree phi_name = PHI_RESULT (phi);
4227 /* ??? If there are no uses of the PHI result the inner loop reduction
4228 won't be detected as possibly double-reduction by vectorizable_reduction
4229 because that tries to walk the PHI arg from the preheader edge which
4230 can be constant. See PR60382. */
4231 if (has_zero_uses (phi_name))
4232 return NULL;
4233 class loop *loop = (gimple_bb (phi))->loop_father;
4234 unsigned nphi_def_loop_uses = 0;
4235 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4237 gimple *use_stmt = USE_STMT (use_p);
4238 if (is_gimple_debug (use_stmt))
4239 continue;
4241 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4243 if (dump_enabled_p ())
4244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4245 "intermediate value used outside loop.\n");
4247 return NULL;
4250 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4251 op1 twice (once as definition, once as else) in the same operation.
4252 Only count it as one. */
4253 if (use_stmt != phi_use_stmt)
4255 nphi_def_loop_uses++;
4256 phi_use_stmt = use_stmt;
4260 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4261 if (TREE_CODE (latch_def) != SSA_NAME)
4263 if (dump_enabled_p ())
4264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4265 "reduction: not ssa_name: %T\n", latch_def);
4266 return NULL;
4269 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4270 if (!def_stmt_info
4271 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4272 return NULL;
4274 bool nested_in_vect_loop
4275 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4276 unsigned nlatch_def_loop_uses = 0;
4277 auto_vec<gphi *, 3> lcphis;
4278 bool inner_loop_of_double_reduc = false;
4279 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4281 gimple *use_stmt = USE_STMT (use_p);
4282 if (is_gimple_debug (use_stmt))
4283 continue;
4284 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4285 nlatch_def_loop_uses++;
4286 else
4288 /* We can have more than one loop-closed PHI. */
4289 lcphis.safe_push (as_a <gphi *> (use_stmt));
4290 if (nested_in_vect_loop
4291 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4292 == vect_double_reduction_def))
4293 inner_loop_of_double_reduc = true;
4297 /* If we are vectorizing an inner reduction, we execute it in the
4298 original order only when we are not dealing with a double
4299 reduction. */
4300 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4302 if (dump_enabled_p ())
4303 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4304 "detected nested cycle: ");
4305 return def_stmt_info;
4308 /* When the inner loop of a double reduction ends up with more than
4309 one loop-closed PHI we have failed to classify alternate such
4310 PHIs as double reduction, leading to wrong code. See PR103237. */
4311 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4313 if (dump_enabled_p ())
4314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4315 "unhandle double reduction\n");
4316 return NULL;
4319 /* If this isn't a nested cycle or if the nested cycle reduction value
4320 is used outside of the inner loop we cannot handle uses of the reduction
4321 value. */
4322 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4324 if (dump_enabled_p ())
4325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4326 "reduction used in loop.\n");
4327 return NULL;
4330 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4331 defined in the inner loop. */
4332 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4334 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4335 if (gimple_phi_num_args (def_stmt) != 1
4336 || TREE_CODE (op1) != SSA_NAME)
4338 if (dump_enabled_p ())
4339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4340 "unsupported phi node definition.\n");
4342 return NULL;
4345 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4346 and the latch definition op1. */
4347 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4348 if (gimple_bb (def1)
4349 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4350 && loop->inner
4351 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4352 && (is_gimple_assign (def1) || is_gimple_call (def1))
4353 && is_a <gphi *> (phi_use_stmt)
4354 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4355 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4356 loop_latch_edge (loop->inner))))
4358 if (dump_enabled_p ())
4359 report_vect_op (MSG_NOTE, def_stmt,
4360 "detected double reduction: ");
4362 *double_reduc = true;
4363 return def_stmt_info;
4366 return NULL;
4369 /* Look for the expression computing latch_def from the loop PHI result. */
4370 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4371 code_helper code;
4372 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4373 path))
4375 STMT_VINFO_REDUC_CODE (phi_info) = code;
4376 if (code == COND_EXPR && !nested_in_vect_loop)
4377 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4379 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4380 reduction chain for which the additional restriction is that
4381 all operations in the chain are the same. */
4382 auto_vec<stmt_vec_info, 8> reduc_chain;
4383 unsigned i;
4384 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4385 for (i = path.length () - 1; i >= 1; --i)
4387 gimple *stmt = USE_STMT (path[i].second);
4388 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4389 gimple_match_op op;
4390 if (!gimple_extract_op (stmt, &op))
4391 gcc_unreachable ();
4392 if (gassign *assign = dyn_cast<gassign *> (stmt))
4393 STMT_VINFO_REDUC_IDX (stmt_info)
4394 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4395 else
4397 gcall *call = as_a<gcall *> (stmt);
4398 STMT_VINFO_REDUC_IDX (stmt_info)
4399 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4401 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4402 && (i == 1 || i == path.length () - 1));
4403 if ((op.code != code && !leading_conversion)
4404 /* We can only handle the final value in epilogue
4405 generation for reduction chains. */
4406 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4407 is_slp_reduc = false;
4408 /* For reduction chains we support trailing/leading
4409 conversions. We do not store those in the actual chain. */
4410 if (leading_conversion)
4411 continue;
4412 reduc_chain.safe_push (stmt_info);
4414 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4416 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4418 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4419 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4421 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4422 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4424 /* Save the chain for further analysis in SLP detection. */
4425 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4426 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4428 *reduc_chain_p = true;
4429 if (dump_enabled_p ())
4430 dump_printf_loc (MSG_NOTE, vect_location,
4431 "reduction: detected reduction chain\n");
4433 else if (dump_enabled_p ())
4434 dump_printf_loc (MSG_NOTE, vect_location,
4435 "reduction: detected reduction\n");
4437 return def_stmt_info;
4440 if (dump_enabled_p ())
4441 dump_printf_loc (MSG_NOTE, vect_location,
4442 "reduction: unknown pattern\n");
4444 return NULL;
4447 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4448 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4449 or -1 if not known. */
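/* Rough illustration of the computation below: with NITERS == 23,
   an assumed VF of 8 and PEEL_ITERS_PROLOGUE == 3 the epilogue gets
   (23 - 3) % 8 == 4 iterations; with an unknown iteration count we
   conservatively assume VF/2 == 4 epilogue iterations instead.  */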
4451 static int
4452 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4454 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4455 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4457 if (dump_enabled_p ())
4458 dump_printf_loc (MSG_NOTE, vect_location,
4459 "cost model: epilogue peel iters set to vf/2 "
4460 "because loop iterations are unknown .\n");
4461 return assumed_vf / 2;
4463 else
4465 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4466 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4467 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4468 /* If we need to peel for gaps but the epilogue peel count computed above
4469 is zero, we have to peel VF iterations. */
4470 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4471 peel_iters_epilogue = assumed_vf;
4472 return peel_iters_epilogue;
4476 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4478 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4479 int *peel_iters_epilogue,
4480 stmt_vector_for_cost *scalar_cost_vec,
4481 stmt_vector_for_cost *prologue_cost_vec,
4482 stmt_vector_for_cost *epilogue_cost_vec)
4484 int retval = 0;
4486 *peel_iters_epilogue
4487 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4489 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4491 /* If peeled iterations are known but the number of scalar loop
4492 iterations is unknown, count a taken branch per peeled loop. */
4493 if (peel_iters_prologue > 0)
4494 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4495 vect_prologue);
4496 if (*peel_iters_epilogue > 0)
4497 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4498 vect_epilogue);
4501 stmt_info_for_cost *si;
4502 int j;
4503 if (peel_iters_prologue)
4504 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4505 retval += record_stmt_cost (prologue_cost_vec,
4506 si->count * peel_iters_prologue,
4507 si->kind, si->stmt_info, si->misalign,
4508 vect_prologue);
4509 if (*peel_iters_epilogue)
4510 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4511 retval += record_stmt_cost (epilogue_cost_vec,
4512 si->count * *peel_iters_epilogue,
4513 si->kind, si->stmt_info, si->misalign,
4514 vect_epilogue);
4516 return retval;
4519 /* Function vect_estimate_min_profitable_iters
4521 Return the number of iterations required for the vector version of the
4522 loop to be profitable relative to the cost of the scalar version of the
4523 loop.
4525 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4526 of iterations for vectorization. A value of -1 means loop vectorization
4527 is not profitable. This returned value may be used for a dynamic
4528 profitability check.
4530 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4531 for static check against estimated number of iterations. */
4533 static void
4534 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4535 int *ret_min_profitable_niters,
4536 int *ret_min_profitable_estimate,
4537 unsigned *suggested_unroll_factor)
4539 int min_profitable_iters;
4540 int min_profitable_estimate;
4541 int peel_iters_prologue;
4542 int peel_iters_epilogue;
4543 unsigned vec_inside_cost = 0;
4544 int vec_outside_cost = 0;
4545 unsigned vec_prologue_cost = 0;
4546 unsigned vec_epilogue_cost = 0;
4547 int scalar_single_iter_cost = 0;
4548 int scalar_outside_cost = 0;
4549 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4550 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4551 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4553 /* Cost model disabled. */
4554 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4556 if (dump_enabled_p ())
4557 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4558 *ret_min_profitable_niters = 0;
4559 *ret_min_profitable_estimate = 0;
4560 return;
4563 /* Requires loop versioning tests to handle misalignment. */
4564 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4566 /* FIXME: Make cost depend on complexity of individual check. */
4567 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4568 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4569 if (dump_enabled_p ())
4570 dump_printf (MSG_NOTE,
4571 "cost model: Adding cost of checks for loop "
4572 "versioning to treat misalignment.\n");
4575 /* Requires loop versioning with alias checks. */
4576 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4578 /* FIXME: Make cost depend on complexity of individual check. */
4579 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4580 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4581 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4582 if (len)
4583 /* Count LEN - 1 ANDs and LEN comparisons. */
4584 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4585 scalar_stmt, vect_prologue);
4586 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4587 if (len)
4589 /* Count LEN - 1 ANDs and LEN comparisons. */
4590 unsigned int nstmts = len * 2 - 1;
4591 /* +1 for each bias that needs adding. */
4592 for (unsigned int i = 0; i < len; ++i)
4593 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4594 nstmts += 1;
4595 (void) add_stmt_cost (target_cost_data, nstmts,
4596 scalar_stmt, vect_prologue);
4598 if (dump_enabled_p ())
4599 dump_printf (MSG_NOTE,
4600 "cost model: Adding cost of checks for loop "
4601 "versioning aliasing.\n");
4604 /* Requires loop versioning with niter checks. */
4605 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4607 /* FIXME: Make cost depend on complexity of individual check. */
4608 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4609 NULL, NULL, NULL_TREE, 0, vect_prologue);
4610 if (dump_enabled_p ())
4611 dump_printf (MSG_NOTE,
4612 "cost model: Adding cost of checks for loop "
4613 "versioning niters.\n");
4616 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4617 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4618 vect_prologue);
4620 /* Count statements in scalar loop. Using this as scalar cost for a single
4621 iteration for now.
4623 TODO: Add outer loop support.
4625 TODO: Consider assigning different costs to different scalar
4626 statements. */
4628 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4630 /* Add additional cost for the peeled instructions in prologue and epilogue
4631 loop. (For fully-masked loops there will be no peeling.)
4633 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4634 at compile time, we assume it's vf/2 (the worst case would be vf-1).
4636 TODO: Build an expression that represents peel_iters for prologue and
4637 epilogue to be used in a run-time test. */
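/* Rough illustration of the assumption above: with an assumed VF of 8
   and unknown prologue/epilogue peel counts, the cost model charges 4
   peeled scalar iterations to the prologue and 4 to the epilogue.  */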
4639 bool prologue_need_br_taken_cost = false;
4640 bool prologue_need_br_not_taken_cost = false;
4642 /* Calculate peel_iters_prologue. */
4643 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4644 peel_iters_prologue = 0;
4645 else if (npeel < 0)
4647 peel_iters_prologue = assumed_vf / 2;
4648 if (dump_enabled_p ())
4649 dump_printf (MSG_NOTE, "cost model: "
4650 "prologue peel iters set to vf/2.\n");
4652 /* If peeled iterations are unknown, count a taken branch and a not taken
4653 branch per peeled loop. Even if scalar loop iterations are known,
4654 vector iterations are not known since peeled prologue iterations are
4655 not known. Hence guards remain the same. */
4656 prologue_need_br_taken_cost = true;
4657 prologue_need_br_not_taken_cost = true;
4659 else
4661 peel_iters_prologue = npeel;
4662 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4663 /* If peeled iterations are known but the number of scalar loop
4664 iterations is unknown, count a taken branch per peeled loop. */
4665 prologue_need_br_taken_cost = true;
4668 bool epilogue_need_br_taken_cost = false;
4669 bool epilogue_need_br_not_taken_cost = false;
4671 /* Calculate peel_iters_epilogue. */
4672 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4673 /* We need to peel exactly one iteration for gaps. */
4674 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4675 else if (npeel < 0)
4677 /* If peeling for alignment is unknown, the loop bound of the main loop
4678 becomes unknown. */
4679 peel_iters_epilogue = assumed_vf / 2;
4680 if (dump_enabled_p ())
4681 dump_printf (MSG_NOTE, "cost model: "
4682 "epilogue peel iters set to vf/2 because "
4683 "peeling for alignment is unknown.\n");
4685 /* See the same reason above in peel_iters_prologue calculation. */
4686 epilogue_need_br_taken_cost = true;
4687 epilogue_need_br_not_taken_cost = true;
4689 else
4691 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4692 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4693 /* If peeled iterations are known but the number of scalar loop
4694 iterations is unknown, count a taken branch per peeled loop. */
4695 epilogue_need_br_taken_cost = true;
4698 stmt_info_for_cost *si;
4699 int j;
4700 /* Add costs associated with peel_iters_prologue. */
4701 if (peel_iters_prologue)
4702 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4704 (void) add_stmt_cost (target_cost_data,
4705 si->count * peel_iters_prologue, si->kind,
4706 si->stmt_info, si->node, si->vectype,
4707 si->misalign, vect_prologue);
4710 /* Add costs associated with peel_iters_epilogue. */
4711 if (peel_iters_epilogue)
4712 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4714 (void) add_stmt_cost (target_cost_data,
4715 si->count * peel_iters_epilogue, si->kind,
4716 si->stmt_info, si->node, si->vectype,
4717 si->misalign, vect_epilogue);
4720 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4722 if (prologue_need_br_taken_cost)
4723 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4724 vect_prologue);
4726 if (prologue_need_br_not_taken_cost)
4727 (void) add_stmt_cost (target_cost_data, 1,
4728 cond_branch_not_taken, vect_prologue);
4730 if (epilogue_need_br_taken_cost)
4731 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4732 vect_epilogue);
4734 if (epilogue_need_br_not_taken_cost)
4735 (void) add_stmt_cost (target_cost_data, 1,
4736 cond_branch_not_taken, vect_epilogue);
4738 /* Take care of special costs for rgroup controls of partial vectors. */
4739 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4740 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4741 == vect_partial_vectors_avx512))
4743 /* Calculate how many masks we need to generate. */
4744 unsigned int num_masks = 0;
4745 bool need_saturation = false;
4746 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4747 if (rgm.type)
4749 unsigned nvectors = rgm.factor;
4750 num_masks += nvectors;
4751 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4752 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4753 need_saturation = true;
4756 /* ??? The target isn't able to identify the costs below as
4757 producing masks so it cannot penalize cases where we'd run
4758 out of mask registers for example. */
4760 /* ??? We are also failing to account for smaller vector masks
4761 we generate by splitting larger masks in vect_get_loop_mask. */
4763 /* In the worst case, we need to generate each mask in the prologue
4764 and in the loop body. We need one splat per group and one
4765 compare per mask.
4767 Sometimes the prologue mask will fold to a constant,
4768 so the actual prologue cost might be smaller. However, it's
4769 simpler and safer to use the worst-case cost; if this ends up
4770 being the tie-breaker between vectorizing or not, then it's
4771 probably better not to vectorize. */
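/* Illustrative count, assuming two mask rgroups with a set type and
   nvectors of 2 and 1: num_masks == 3, so 3 compares plus 2 splats,
   i.e. 5 vector statements are charged to the prologue and another
   5 to the loop body below.  */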
4772 (void) add_stmt_cost (target_cost_data,
4773 num_masks
4774 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4775 vector_stmt, NULL, NULL, NULL_TREE, 0,
4776 vect_prologue);
4777 (void) add_stmt_cost (target_cost_data,
4778 num_masks
4779 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4780 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4782 /* When we need saturation we need it both in the prologue and
4783 the epilogue. */
4784 if (need_saturation)
4786 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4787 NULL, NULL, NULL_TREE, 0, vect_prologue);
4788 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4789 NULL, NULL, NULL_TREE, 0, vect_body);
4792 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4793 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4794 == vect_partial_vectors_while_ult))
4796 /* Calculate how many masks we need to generate. */
4797 unsigned int num_masks = 0;
4798 rgroup_controls *rgm;
4799 unsigned int num_vectors_m1;
4800 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4801 num_vectors_m1, rgm)
4802 if (rgm->type)
4803 num_masks += num_vectors_m1 + 1;
4804 gcc_assert (num_masks > 0);
4806 /* In the worst case, we need to generate each mask in the prologue
4807 and in the loop body. One of the loop body mask instructions
4808 replaces the comparison in the scalar loop, and since we don't
4809 count the scalar comparison against the scalar body, we shouldn't
4810 count that vector instruction against the vector body either.
4812 Sometimes we can use unpacks instead of generating prologue
4813 masks and sometimes the prologue mask will fold to a constant,
4814 so the actual prologue cost might be smaller. However, it's
4815 simpler and safer to use the worst-case cost; if this ends up
4816 being the tie-breaker between vectorizing or not, then it's
4817 probably better not to vectorize. */
4818 (void) add_stmt_cost (target_cost_data, num_masks,
4819 vector_stmt, NULL, NULL, NULL_TREE, 0,
4820 vect_prologue);
4821 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4822 vector_stmt, NULL, NULL, NULL_TREE, 0,
4823 vect_body);
4825 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4827 /* Referring to the functions vect_set_loop_condition_partial_vectors
4828 and vect_set_loop_controls_directly, we need to generate each
4829 length in the prologue and in the loop body if required. Although
4830 there are some possible optimizations, we consider the worst case
4831 here. */
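/* A small worked case for the counting below, assuming a single length
   rgroup with a set type, nitems == 4, an unknown iteration count, no
   wrap-around risk, a zero load/store bias and need_iterate_p: the
   prologue gets 1 SHIFT + 1 MIN == 2 statements and the body gets
   3 statements to update the length for the next iteration.  */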
4833 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4834 signed char partial_load_store_bias
4835 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4836 bool need_iterate_p
4837 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4838 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4840 /* Calculate how many statements to be added. */
4841 unsigned int prologue_stmts = 0;
4842 unsigned int body_stmts = 0;
4844 rgroup_controls *rgc;
4845 unsigned int num_vectors_m1;
4846 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4847 if (rgc->type)
4849 /* May need one SHIFT for nitems_total computation. */
4850 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4851 if (nitems != 1 && !niters_known_p)
4852 prologue_stmts += 1;
4854 /* May need one MAX and one MINUS for wrap around. */
4855 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4856 prologue_stmts += 2;
4858 /* Need one MAX and one MINUS for each batch limit except for
4859 the first one. */
4860 prologue_stmts += num_vectors_m1 * 2;
4862 unsigned int num_vectors = num_vectors_m1 + 1;
4864 /* Need to set up lengths in prologue, only one MIN required
4865 for each since start index is zero. */
4866 prologue_stmts += num_vectors;
4868 /* If we have a non-zero partial load bias, we need one PLUS
4869 to adjust the load length. */
4870 if (partial_load_store_bias != 0)
4871 body_stmts += 1;
4873 /* Each may need two MINs and one MINUS to update lengths in body
4874 for next iteration. */
4875 if (need_iterate_p)
4876 body_stmts += 3 * num_vectors;
4879 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4880 scalar_stmt, vect_prologue);
4881 (void) add_stmt_cost (target_cost_data, body_stmts,
4882 scalar_stmt, vect_body);
4885 /* FORNOW: The scalar outside cost is incremented in one of the
4886 following ways:
4888 1. The vectorizer checks for alignment and aliasing and generates
4889 a condition that allows dynamic vectorization. A cost model
4890 check is ANDED with the versioning condition. Hence scalar code
4891 path now has the added cost of the versioning check.
4893 if (cost > th & versioning_check)
4894 jmp to vector code
4896 Hence run-time scalar is incremented by not-taken branch cost.
4898 2. The vectorizer then checks if a prologue is required. If the
4899 cost model check was not done before during versioning, it has to
4900 be done before the prologue check.
4902 if (cost <= th)
4903 prologue = scalar_iters
4904 if (prologue == 0)
4905 jmp to vector code
4906 else
4907 execute prologue
4908 if (prologue == num_iters)
4909 go to exit
4911 Hence the run-time scalar cost is incremented by a taken branch,
4912 plus a not-taken branch, plus a taken branch cost.
4914 3. The vectorizer then checks if an epilogue is required. If the
4915 cost model check was not done before during prologue check, it
4916 has to be done with the epilogue check.
4918 if (prologue == 0)
4919 jmp to vector code
4920 else
4921 execute prologue
4922 if (prologue == num_iters)
4923 go to exit
4924 vector code:
4925 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4926 jmp to epilogue
4928 Hence the run-time scalar cost should be incremented by 2 taken
4929 branches.
4931 TODO: The back end may reorder the BBs differently and reverse
4932 conditions/branch directions. Change the estimates below to
4933 something more reasonable. */
4935 /* If the number of iterations is known and we do not do versioning, we can
4936 decide whether to vectorize at compile time. Hence the scalar version
4937 does not carry cost model guard costs. */
4938 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4939 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4941 /* Cost model check occurs at versioning. */
4942 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4943 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4944 else
4946 /* Cost model check occurs at prologue generation. */
4947 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4948 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4949 + vect_get_stmt_cost (cond_branch_not_taken);
4950 /* Cost model check occurs at epilogue generation. */
4951 else
4952 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4956 /* Complete the target-specific cost calculations. */
4957 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4958 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4959 suggested_unroll_factor);
4961 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4962 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4963 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4964 *suggested_unroll_factor,
4965 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4967 if (dump_enabled_p ())
4968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4969 "can't unroll as unrolled vectorization factor larger"
4970 " than maximum vectorization factor: "
4971 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4972 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4973 *suggested_unroll_factor = 1;
4976 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4978 if (dump_enabled_p ())
4980 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4981 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4982 vec_inside_cost);
4983 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4984 vec_prologue_cost);
4985 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4986 vec_epilogue_cost);
4987 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4988 scalar_single_iter_cost);
4989 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4990 scalar_outside_cost);
4991 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4992 vec_outside_cost);
4993 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4994 peel_iters_prologue);
4995 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4996 peel_iters_epilogue);
4999 /* Calculate number of iterations required to make the vector version
5000 profitable, relative to the loop bodies only. The following condition
5001 must hold true:
5002 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5003 where
5004 SIC = scalar iteration cost, VIC = vector iteration cost,
5005 VOC = vector outside cost, VF = vectorization factor,
5006 NPEEL = prologue iterations + epilogue iterations,
5007 SOC = scalar outside cost for run time cost model check. */
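/* A rough numerical illustration of this condition: with SIC = 4,
   VIC = 6, VF = 4, NPEEL = 0, VOC = 10 and SOC = 0 it reads
   4 * niters > 1.5 * niters + 10, i.e. 2.5 * niters > 10, which first
   holds for niters >= 5.  */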
5009 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5010 - vec_inside_cost);
5011 if (saving_per_viter <= 0)
5013 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5014 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5015 "vectorization did not happen for a simd loop");
5017 if (dump_enabled_p ())
5018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5019 "cost model: the vector iteration cost = %d "
5020 "divided by the scalar iteration cost = %d "
5021 "is greater or equal to the vectorization factor = %d"
5022 ".\n",
5023 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5024 *ret_min_profitable_niters = -1;
5025 *ret_min_profitable_estimate = -1;
5026 return;
5029 /* ??? The "if" arm is written to handle all cases; see below for what
5030 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5031 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5033 /* Rewriting the condition above in terms of the number of
5034 vector iterations (vniters) rather than the number of
5035 scalar iterations (niters) gives:
5037 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5039 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5041 For integer N, X and Y when X > 0:
5043 N * X > Y <==> N >= (Y /[floor] X) + 1. */
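/* Quick sanity check of the identity above: with X = 3 and Y = 7,
   floor (7 / 3) + 1 == 3, and indeed N * 3 > 7 first holds at N = 3.  */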
5044 int outside_overhead = (vec_outside_cost
5045 - scalar_single_iter_cost * peel_iters_prologue
5046 - scalar_single_iter_cost * peel_iters_epilogue
5047 - scalar_outside_cost);
5048 /* We're only interested in cases that require at least one
5049 vector iteration. */
5050 int min_vec_niters = 1;
5051 if (outside_overhead > 0)
5052 min_vec_niters = outside_overhead / saving_per_viter + 1;
5054 if (dump_enabled_p ())
5055 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5056 min_vec_niters);
5058 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5060 /* Now that we know the minimum number of vector iterations,
5061 find the minimum niters for which the scalar cost is larger:
5063 SIC * niters > VIC * vniters + VOC - SOC
5065 We know that the minimum niters is no more than
5066 vniters * VF + NPEEL, but it might be (and often is) less
5067 than that if a partial vector iteration is cheaper than the
5068 equivalent scalar code. */
5069 int threshold = (vec_inside_cost * min_vec_niters
5070 + vec_outside_cost
5071 - scalar_outside_cost);
5072 if (threshold <= 0)
5073 min_profitable_iters = 1;
5074 else
5075 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5077 else
5078 /* Convert the number of vector iterations into a number of
5079 scalar iterations. */
5080 min_profitable_iters = (min_vec_niters * assumed_vf
5081 + peel_iters_prologue
5082 + peel_iters_epilogue);
5084 else
5086 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5087 * assumed_vf
5088 - vec_inside_cost * peel_iters_prologue
5089 - vec_inside_cost * peel_iters_epilogue);
5090 if (min_profitable_iters <= 0)
5091 min_profitable_iters = 0;
5092 else
5094 min_profitable_iters /= saving_per_viter;
5096 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5097 <= (((int) vec_inside_cost * min_profitable_iters)
5098 + (((int) vec_outside_cost - scalar_outside_cost)
5099 * assumed_vf)))
5100 min_profitable_iters++;
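/* With the same hypothetical numbers, this branch computes
   ((20 - 2) * 4 - 10 - 10) / 6 = 52 / 6 = 8, and the rounding check
   above then bumps the result to 9 scalar iterations. */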
5104 if (dump_enabled_p ())
5105 dump_printf (MSG_NOTE,
5106 " Calculated minimum iters for profitability: %d\n",
5107 min_profitable_iters);
5109 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5110 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5111 /* We want the vectorized loop to execute at least once. */
5112 min_profitable_iters = assumed_vf + peel_iters_prologue;
5113 else if (min_profitable_iters < peel_iters_prologue)
5114 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5115 vectorized loop executes at least once. */
5116 min_profitable_iters = peel_iters_prologue;
5118 if (dump_enabled_p ())
5119 dump_printf_loc (MSG_NOTE, vect_location,
5120 " Runtime profitability threshold = %d\n",
5121 min_profitable_iters);
5123 *ret_min_profitable_niters = min_profitable_iters;
5125 /* Calculate number of iterations required to make the vector version
5126 profitable, relative to the loop bodies only.
5128 The non-vectorized variant costs SIC * niters; for the vector variant
5129 to win at the expected loop trip count, the following condition must hold true:
5130 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5132 if (vec_outside_cost <= 0)
5133 min_profitable_estimate = 0;
5134 /* ??? This "else if" arm is written to handle all cases; see below for
5135 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5136 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5138 /* This is a repeat of the code above, but with + SOC rather
5139 than - SOC. */
5140 int outside_overhead = (vec_outside_cost
5141 - scalar_single_iter_cost * peel_iters_prologue
5142 - scalar_single_iter_cost * peel_iters_epilogue
5143 + scalar_outside_cost);
5144 int min_vec_niters = 1;
5145 if (outside_overhead > 0)
5146 min_vec_niters = outside_overhead / saving_per_viter + 1;
5148 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5150 int threshold = (vec_inside_cost * min_vec_niters
5151 + vec_outside_cost
5152 + scalar_outside_cost);
5153 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5155 else
5156 min_profitable_estimate = (min_vec_niters * assumed_vf
5157 + peel_iters_prologue
5158 + peel_iters_epilogue);
5160 else
5162 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5163 * assumed_vf
5164 - vec_inside_cost * peel_iters_prologue
5165 - vec_inside_cost * peel_iters_epilogue)
5166 / ((scalar_single_iter_cost * assumed_vf)
5167 - vec_inside_cost);
5169 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5170 if (dump_enabled_p ())
5171 dump_printf_loc (MSG_NOTE, vect_location,
5172 " Static estimate profitability threshold = %d\n",
5173 min_profitable_estimate);
5175 *ret_min_profitable_estimate = min_profitable_estimate;
5178 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5179 vector elements (not bits) for a vector with NELT elements. */
5180 static void
5181 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5182 vec_perm_builder *sel)
5184 /* The encoding is a single stepped pattern. Any wrap-around is handled
5185 by vec_perm_indices. */
5186 sel->new_vector (nelt, 1, 3);
5187 for (unsigned int i = 0; i < 3; i++)
5188 sel->quick_push (i + offset);
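/* For instance, a shift by OFFSET == 2 of an 8-element vector is encoded
   as the stepped series { 2, 3, 4, ... }, which vec_perm_indices expands
   to the selector { 2, 3, 4, 5, 6, 7, 8, 9 }; indices of NELT and above
   select from the second permute operand (a zero vector in the shift
   uses further below). */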
5191 /* Checks whether the target supports whole-vector shifts for vectors of mode
5192 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5193 it supports vec_perm_const with masks for all necessary shift amounts. */
5194 static bool
5195 have_whole_vector_shift (machine_mode mode)
5197 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5198 return true;
5200 /* Variable-length vectors should be handled via the optab. */
5201 unsigned int nelt;
5202 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5203 return false;
5205 vec_perm_builder sel;
5206 vec_perm_indices indices;
5207 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5209 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5210 indices.new_vector (sel, 2, nelt);
5211 if (!can_vec_perm_const_p (mode, mode, indices, false))
5212 return false;
5214 return true;
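/* For example, for an 8-element vector the loop above checks shifts by
   4, 2 and 1 elements, exactly the offsets used by the shift-based
   reduction epilogue in vect_create_epilog_for_reduction. */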
5217 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5218 multiplication operands have differing signs and (b) we intend
5219 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5220 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5222 static bool
5223 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5224 stmt_vec_info stmt_info)
5226 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5227 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5228 return false;
5230 tree rhs1 = gimple_assign_rhs1 (assign);
5231 tree rhs2 = gimple_assign_rhs2 (assign);
5232 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5233 return false;
5235 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5236 gcc_assert (reduc_info->is_reduc_info);
5237 return !directly_supported_p (DOT_PROD_EXPR,
5238 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5239 optab_vector_mixed_sign);
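/* A typical example is a dot product whose multiplication mixes a signed
   and an unsigned narrow input (e.g. signed char by unsigned char,
   accumulated into int) on a target that lacks a mixed-sign dot-product
   instruction. */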
5242 /* TODO: There is a close dependency between the vect_model_*_cost and
5243 vectorizable_* functions. Rework the design to avoid maintenance issues. */
5245 /* Function vect_model_reduction_cost.
5247 Models cost for a reduction operation, including the vector ops
5248 generated within the strip-mine loop in some cases, the initial
5249 definition before the loop, and the epilogue code that must be generated. */
5251 static void
5252 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5253 stmt_vec_info stmt_info, internal_fn reduc_fn,
5254 vect_reduction_type reduction_type,
5255 int ncopies, stmt_vector_for_cost *cost_vec)
5257 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5258 tree vectype;
5259 machine_mode mode;
5260 class loop *loop = NULL;
5262 if (loop_vinfo)
5263 loop = LOOP_VINFO_LOOP (loop_vinfo);
5265 /* Condition reductions generate two reductions in the loop. */
5266 if (reduction_type == COND_REDUCTION)
5267 ncopies *= 2;
5269 vectype = STMT_VINFO_VECTYPE (stmt_info);
5270 mode = TYPE_MODE (vectype);
5271 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5273 gimple_match_op op;
5274 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5275 gcc_unreachable ();
5277 bool emulated_mixed_dot_prod
5278 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5279 if (reduction_type == EXTRACT_LAST_REDUCTION)
5280 /* No extra instructions are needed in the prologue. The loop body
5281 operations are costed in vectorizable_condition. */
5282 inside_cost = 0;
5283 else if (reduction_type == FOLD_LEFT_REDUCTION)
5285 /* No extra instructions needed in the prologue. */
5286 prologue_cost = 0;
5288 if (reduc_fn != IFN_LAST)
5289 /* Count one reduction-like operation per vector. */
5290 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5291 stmt_info, 0, vect_body);
5292 else
5294 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5295 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5296 inside_cost = record_stmt_cost (cost_vec, nelements,
5297 vec_to_scalar, stmt_info, 0,
5298 vect_body);
5299 inside_cost += record_stmt_cost (cost_vec, nelements,
5300 scalar_stmt, stmt_info, 0,
5301 vect_body);
5304 else
5306 /* Add in the cost of the initial definitions. */
5307 int prologue_stmts;
5308 if (reduction_type == COND_REDUCTION)
5309 /* For cond reductions we have four vectors: initial index, step,
5310 initial result of the data reduction, initial value of the index
5311 reduction. */
5312 prologue_stmts = 4;
5313 else if (emulated_mixed_dot_prod)
5314 /* We need the initial reduction value and two invariants:
5315 one that contains the minimum signed value and one that
5316 contains half of its negative. */
5317 prologue_stmts = 3;
5318 else
5319 prologue_stmts = 1;
5320 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5321 scalar_to_vec, stmt_info, 0,
5322 vect_prologue);
5325 /* Determine cost of epilogue code.
5327 We have a reduction operator that will reduce the vector in one statement.
5328 Also requires scalar extract. */
5330 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5332 if (reduc_fn != IFN_LAST)
5334 if (reduction_type == COND_REDUCTION)
5336 /* An EQ stmt and a COND_EXPR stmt. */
5337 epilogue_cost += record_stmt_cost (cost_vec, 2,
5338 vector_stmt, stmt_info, 0,
5339 vect_epilogue);
5340 /* Reduction of the max index and a reduction of the found
5341 values. */
5342 epilogue_cost += record_stmt_cost (cost_vec, 2,
5343 vec_to_scalar, stmt_info, 0,
5344 vect_epilogue);
5345 /* A broadcast of the max value. */
5346 epilogue_cost += record_stmt_cost (cost_vec, 1,
5347 scalar_to_vec, stmt_info, 0,
5348 vect_epilogue);
5350 else
5352 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5353 stmt_info, 0, vect_epilogue);
5354 epilogue_cost += record_stmt_cost (cost_vec, 1,
5355 vec_to_scalar, stmt_info, 0,
5356 vect_epilogue);
5359 else if (reduction_type == COND_REDUCTION)
5361 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5362 /* Extraction of scalar elements. */
5363 epilogue_cost += record_stmt_cost (cost_vec,
5364 2 * estimated_nunits,
5365 vec_to_scalar, stmt_info, 0,
5366 vect_epilogue);
5367 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5368 epilogue_cost += record_stmt_cost (cost_vec,
5369 2 * estimated_nunits - 3,
5370 scalar_stmt, stmt_info, 0,
5371 vect_epilogue);
5373 else if (reduction_type == EXTRACT_LAST_REDUCTION
5374 || reduction_type == FOLD_LEFT_REDUCTION)
5375 /* No extra instructions are needed in the epilogue. */
5377 else
5379 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5380 tree bitsize = TYPE_SIZE (op.type);
5381 int element_bitsize = tree_to_uhwi (bitsize);
5382 int nelements = vec_size_in_bits / element_bitsize;
5384 if (op.code == COND_EXPR)
5385 op.code = MAX_EXPR;
5387 /* We have a whole vector shift available. */
5388 if (VECTOR_MODE_P (mode)
5389 && directly_supported_p (op.code, vectype)
5390 && have_whole_vector_shift (mode))
5392 /* Final reduction via vector shifts and the reduction operator.
5393 Also requires scalar extract. */
5394 epilogue_cost += record_stmt_cost (cost_vec,
5395 exact_log2 (nelements) * 2,
5396 vector_stmt, stmt_info, 0,
5397 vect_epilogue);
5398 epilogue_cost += record_stmt_cost (cost_vec, 1,
5399 vec_to_scalar, stmt_info, 0,
5400 vect_epilogue);
5402 else
5403 /* Use extracts and reduction op for final reduction. For N
5404 elements, we have N extracts and N-1 reduction ops. */
5405 epilogue_cost += record_stmt_cost (cost_vec,
5406 nelements + nelements - 1,
5407 vector_stmt, stmt_info, 0,
5408 vect_epilogue);
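/* E.g. for a four-element vector this records 4 extracts plus 3
   reduction ops, i.e. 7 epilogue statements. */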
5412 if (dump_enabled_p ())
5413 dump_printf (MSG_NOTE,
5414 "vect_model_reduction_cost: inside_cost = %d, "
5415 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5416 prologue_cost, epilogue_cost);
5419 /* SEQ is a sequence of instructions that initialize the reduction
5420 described by REDUC_INFO. Emit them in the appropriate place. */
5422 static void
5423 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5424 stmt_vec_info reduc_info, gimple *seq)
5426 if (reduc_info->reused_accumulator)
5428 /* When reusing an accumulator from the main loop, we only need
5429 initialization instructions if the main loop can be skipped.
5430 In that case, emit the initialization instructions at the end
5431 of the guard block that does the skip. */
5432 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5433 gcc_assert (skip_edge);
5434 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5435 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5437 else
5439 /* The normal case: emit the initialization instructions on the
5440 preheader edge. */
5441 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5442 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5446 /* Function get_initial_def_for_reduction
5448 Input:
5449 REDUC_INFO - the info_for_reduction
5450 INIT_VAL - the initial value of the reduction variable
5451 NEUTRAL_OP - a value that has no effect on the reduction, as per
5452 neutral_op_for_reduction
5454 Output:
5455 Return a vector variable, initialized according to the reduction
5456 operation described by REDUC_INFO. This vector will be used as the initial value
5457 of the vector of partial results.
5459 The value we need is a vector in which element 0 has value INIT_VAL
5460 and every other element has value NEUTRAL_OP. */
5462 static tree
5463 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5464 stmt_vec_info reduc_info,
5465 tree init_val, tree neutral_op)
5467 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5468 tree scalar_type = TREE_TYPE (init_val);
5469 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5470 tree init_def;
5471 gimple_seq stmts = NULL;
5473 gcc_assert (vectype);
5475 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5476 || SCALAR_FLOAT_TYPE_P (scalar_type));
5478 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5479 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5481 if (operand_equal_p (init_val, neutral_op))
5483 /* If both elements are equal then the vector described above is
5484 just a splat. */
5485 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5486 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5488 else
5490 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5491 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5492 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5494 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5495 element 0. */
5496 init_def = gimple_build_vector_from_val (&stmts, vectype,
5497 neutral_op);
5498 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5499 vectype, init_def, init_val);
5501 else
5503 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5504 tree_vector_builder elts (vectype, 1, 2);
5505 elts.quick_push (init_val);
5506 elts.quick_push (neutral_op);
5507 init_def = gimple_build_vector (&stmts, &elts);
5511 if (stmts)
5512 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5513 return init_def;
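/* For example, for a four-lane sum reduction with INIT_VAL 5 and
   NEUTRAL_OP 0 the function above builds { 5, 0, 0, 0 }, whereas for a
   MIN reduction the initial value is its own neutral value and the
   result is simply the splat { 5, 5, 5, 5 }. */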
5516 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5517 which performs a reduction involving GROUP_SIZE scalar statements.
5518 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5519 is nonnull, introducing extra elements of that value will not change the
5520 result. */
5522 static void
5523 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5524 stmt_vec_info reduc_info,
5525 vec<tree> *vec_oprnds,
5526 unsigned int number_of_vectors,
5527 unsigned int group_size, tree neutral_op)
5529 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5530 unsigned HOST_WIDE_INT nunits;
5531 unsigned j, number_of_places_left_in_vector;
5532 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5533 unsigned int i;
5535 gcc_assert (group_size == initial_values.length () || neutral_op);
5537 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5538 created vectors. It is greater than 1 if unrolling is performed.
5540 For example, we have two scalar operands, s1 and s2 (e.g., group of
5541 strided accesses of size two), while NUNITS is four (i.e., four scalars
5542 of this type can be packed in a vector). The output vector will contain
5543 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5544 will be 2).
5546 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5547 vectors containing the operands.
5549 For example, NUNITS is four as before, and the group size is 8
5550 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5551 {s5, s6, s7, s8}. */
5553 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5554 nunits = group_size;
5556 number_of_places_left_in_vector = nunits;
5557 bool constant_p = true;
5558 tree_vector_builder elts (vector_type, nunits, 1);
5559 elts.quick_grow (nunits);
5560 gimple_seq ctor_seq = NULL;
5561 for (j = 0; j < nunits * number_of_vectors; ++j)
5563 tree op;
5564 i = j % group_size;
5566 /* Get the def before the loop. In a reduction chain we have only
5567 one initial value; otherwise we have as many as there are PHIs in the group. */
5568 if (i >= initial_values.length () || (j > i && neutral_op))
5569 op = neutral_op;
5570 else
5571 op = initial_values[i];
5573 /* Create 'vect_ = {op0,op1,...,opn}'. */
5574 number_of_places_left_in_vector--;
5575 elts[nunits - number_of_places_left_in_vector - 1] = op;
5576 if (!CONSTANT_CLASS_P (op))
5577 constant_p = false;
5579 if (number_of_places_left_in_vector == 0)
5581 tree init;
5582 if (constant_p && !neutral_op
5583 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5584 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5585 /* Build the vector directly from ELTS. */
5586 init = gimple_build_vector (&ctor_seq, &elts);
5587 else if (neutral_op)
5589 /* Build a vector of the neutral value and shift the
5590 other elements into place. */
5591 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5592 neutral_op);
5593 int k = nunits;
5594 while (k > 0 && elts[k - 1] == neutral_op)
5595 k -= 1;
5596 while (k > 0)
5598 k -= 1;
5599 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5600 vector_type, init, elts[k]);
5603 else
5605 /* First time round, duplicate ELTS to fill the
5606 required number of vectors. */
5607 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5608 elts, number_of_vectors, *vec_oprnds);
5609 break;
5611 vec_oprnds->quick_push (init);
5613 number_of_places_left_in_vector = nunits;
5614 elts.new_vector (vector_type, nunits, 1);
5615 elts.quick_grow (nunits);
5616 constant_p = true;
5619 if (ctor_seq != NULL)
5620 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5623 /* For a statement STMT_INFO taking part in a reduction operation return
5624 the stmt_vec_info the meta information is stored on. */
5626 stmt_vec_info
5627 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5629 stmt_info = vect_orig_stmt (stmt_info);
5630 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5631 if (!is_a <gphi *> (stmt_info->stmt)
5632 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5633 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5634 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5635 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5637 if (gimple_phi_num_args (phi) == 1)
5638 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5640 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5642 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5643 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5644 stmt_info = info;
5646 return stmt_info;
5649 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5650 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5651 return false. */
5653 static bool
5654 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5655 stmt_vec_info reduc_info)
5657 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5658 if (!main_loop_vinfo)
5659 return false;
5661 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5662 return false;
5664 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5665 auto_vec<tree, 16> main_loop_results (num_phis);
5666 auto_vec<tree, 16> initial_values (num_phis);
5667 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5669 /* The epilogue loop can be entered either from the main loop or
5670 from an earlier guard block. */
5671 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5672 for (tree incoming_value : reduc_info->reduc_initial_values)
5674 /* Look for:
5676 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5677 INITIAL_VALUE(guard block)>. */
5678 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5680 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5681 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5683 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5684 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5686 main_loop_results.quick_push (from_main_loop);
5687 initial_values.quick_push (from_skip);
5690 else
5691 /* The main loop dominates the epilogue loop. */
5692 main_loop_results.splice (reduc_info->reduc_initial_values);
5694 /* See if the main loop has the kind of accumulator we need. */
5695 vect_reusable_accumulator *accumulator
5696 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5697 if (!accumulator
5698 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5699 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5700 accumulator->reduc_info->reduc_scalar_results.begin ()))
5701 return false;
5703 /* Handle the case where we can reduce wider vectors to narrower ones. */
5704 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5705 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5706 unsigned HOST_WIDE_INT m;
5707 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5708 TYPE_VECTOR_SUBPARTS (vectype), &m))
5709 return false;
5710 /* Check the intermediate vector types and operations are available. */
5711 tree prev_vectype = old_vectype;
5712 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5713 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5715 intermediate_nunits = exact_div (intermediate_nunits, 2);
5716 tree intermediate_vectype = get_related_vectype_for_scalar_type
5717 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5718 if (!intermediate_vectype
5719 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5720 intermediate_vectype)
5721 || !can_vec_extract (TYPE_MODE (prev_vectype),
5722 TYPE_MODE (intermediate_vectype)))
5723 return false;
5724 prev_vectype = intermediate_vectype;
5727 /* Non-SLP reductions might apply an adjustment after the reduction
5728 operation, in order to simplify the initialization of the accumulator.
5729 If the epilogue loop carries on from where the main loop left off,
5730 it should apply the same adjustment to the final reduction result.
5732 If the epilogue loop can also be entered directly (rather than via
5733 the main loop), we need to be able to handle that case in the same way,
5734 with the same adjustment. (In principle we could add a PHI node
5735 to select the correct adjustment, but in practice that shouldn't be
5736 necessary.) */
5737 tree main_adjustment
5738 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5739 if (loop_vinfo->main_loop_edge && main_adjustment)
5741 gcc_assert (num_phis == 1);
5742 tree initial_value = initial_values[0];
5743 /* Check that we can use INITIAL_VALUE as the adjustment and
5744 initialize the accumulator with a neutral value instead. */
5745 if (!operand_equal_p (initial_value, main_adjustment))
5746 return false;
5747 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5748 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5749 code, initial_value);
5751 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5752 reduc_info->reduc_initial_values.truncate (0);
5753 reduc_info->reduc_initial_values.splice (initial_values);
5754 reduc_info->reused_accumulator = accumulator;
5755 return true;
5758 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5759 CODE, emitting the stmts to SEQ. Returns a vector def of VECTYPE. */
5761 static tree
5762 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5763 gimple_seq *seq)
5765 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5766 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5767 tree stype = TREE_TYPE (vectype);
5768 tree new_temp = vec_def;
5769 while (nunits > nunits1)
5771 nunits /= 2;
5772 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5773 stype, nunits);
5774 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5776 /* The target has to make sure we support lowpart/highpart
5777 extraction, either via direct vector extract or through
5778 an integer mode punning. */
5779 tree dst1, dst2;
5780 gimple *epilog_stmt;
5781 if (convert_optab_handler (vec_extract_optab,
5782 TYPE_MODE (TREE_TYPE (new_temp)),
5783 TYPE_MODE (vectype1))
5784 != CODE_FOR_nothing)
5786 /* Extract sub-vectors directly once vec_extract becomes
5787 a conversion optab. */
5788 dst1 = make_ssa_name (vectype1);
5789 epilog_stmt
5790 = gimple_build_assign (dst1, BIT_FIELD_REF,
5791 build3 (BIT_FIELD_REF, vectype1,
5792 new_temp, TYPE_SIZE (vectype1),
5793 bitsize_int (0)));
5794 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5795 dst2 = make_ssa_name (vectype1);
5796 epilog_stmt
5797 = gimple_build_assign (dst2, BIT_FIELD_REF,
5798 build3 (BIT_FIELD_REF, vectype1,
5799 new_temp, TYPE_SIZE (vectype1),
5800 bitsize_int (bitsize)));
5801 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5803 else
5805 /* Extract via punning to appropriately sized integer mode
5806 vector. */
5807 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5808 tree etype = build_vector_type (eltype, 2);
5809 gcc_assert (convert_optab_handler (vec_extract_optab,
5810 TYPE_MODE (etype),
5811 TYPE_MODE (eltype))
5812 != CODE_FOR_nothing);
5813 tree tem = make_ssa_name (etype);
5814 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5815 build1 (VIEW_CONVERT_EXPR,
5816 etype, new_temp));
5817 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5818 new_temp = tem;
5819 tem = make_ssa_name (eltype);
5820 epilog_stmt
5821 = gimple_build_assign (tem, BIT_FIELD_REF,
5822 build3 (BIT_FIELD_REF, eltype,
5823 new_temp, TYPE_SIZE (eltype),
5824 bitsize_int (0)));
5825 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5826 dst1 = make_ssa_name (vectype1);
5827 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5828 build1 (VIEW_CONVERT_EXPR,
5829 vectype1, tem));
5830 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5831 tem = make_ssa_name (eltype);
5832 epilog_stmt
5833 = gimple_build_assign (tem, BIT_FIELD_REF,
5834 build3 (BIT_FIELD_REF, eltype,
5835 new_temp, TYPE_SIZE (eltype),
5836 bitsize_int (bitsize)));
5837 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5838 dst2 = make_ssa_name (vectype1);
5839 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5840 build1 (VIEW_CONVERT_EXPR,
5841 vectype1, tem));
5842 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5845 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5848 return new_temp;
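/* For example, reducing a 16-element accumulator down to a four-element
   VECTYPE takes two halving steps (16 -> 8 -> 4), each extracting the
   low and high halves and combining them with CODE. */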
5851 /* Function vect_create_epilog_for_reduction
5853 Create code at the loop-epilog to finalize the result of a reduction
5854 computation.
5856 STMT_INFO is the scalar reduction stmt that is being vectorized.
5857 SLP_NODE is an SLP node containing a group of reduction statements. The
5858 first one in this group is STMT_INFO.
5859 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
5860 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5861 (counting from 0)
5863 This function:
5864 1. Completes the reduction def-use cycles.
5865 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5866 by calling the function specified by REDUC_FN if available, or by
5867 other means (whole-vector shifts or a scalar loop).
5868 The function also creates a new phi node at the loop exit to preserve
5869 loop-closed form, as illustrated below.
5871 The flow at the entry to this function:
5873 loop:
5874 vec_def = phi <vec_init, null> # REDUCTION_PHI
5875 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5876 s_loop = scalar_stmt # (scalar) STMT_INFO
5877 loop_exit:
5878 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5879 use <s_out0>
5880 use <s_out0>
5882 The above is transformed by this function into:
5884 loop:
5885 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5886 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5887 s_loop = scalar_stmt # (scalar) STMT_INFO
5888 loop_exit:
5889 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5890 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5891 v_out2 = reduce <v_out1>
5892 s_out3 = extract_field <v_out2, 0>
5893 s_out4 = adjust_result <s_out3>
5894 use <s_out4>
5895 use <s_out4>
5898 static void
5899 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5900 stmt_vec_info stmt_info,
5901 slp_tree slp_node,
5902 slp_instance slp_node_instance)
5904 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5905 gcc_assert (reduc_info->is_reduc_info);
5906 /* For double reductions we need to get at the inner loop reduction
5907 stmt which has the meta info attached. Our stmt_info is that of the
5908 loop-closed PHI of the inner loop which we remember as
5909 def for the reduction PHI generation. */
5910 bool double_reduc = false;
5911 stmt_vec_info rdef_info = stmt_info;
5912 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5914 gcc_assert (!slp_node);
5915 double_reduc = true;
5916 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5917 (stmt_info->stmt, 0));
5918 stmt_info = vect_stmt_to_vectorize (stmt_info);
5920 gphi *reduc_def_stmt
5921 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5922 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5923 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5924 tree vectype;
5925 machine_mode mode;
5926 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5927 basic_block exit_bb;
5928 tree scalar_dest;
5929 tree scalar_type;
5930 gimple *new_phi = NULL, *phi = NULL;
5931 gimple_stmt_iterator exit_gsi;
5932 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5933 gimple *epilog_stmt = NULL;
5934 gimple *exit_phi;
5935 tree bitsize;
5936 tree def;
5937 tree orig_name, scalar_result;
5938 imm_use_iterator imm_iter, phi_imm_iter;
5939 use_operand_p use_p, phi_use_p;
5940 gimple *use_stmt;
5941 auto_vec<tree> reduc_inputs;
5942 int j, i;
5943 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5944 unsigned int group_size = 1, k;
5945 auto_vec<gimple *> phis;
5946 /* SLP reduction without reduction chain, e.g.,
5947 # a1 = phi <a2, a0>
5948 # b1 = phi <b2, b0>
5949 a2 = operation (a1)
5950 b2 = operation (b1) */
5951 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5952 bool direct_slp_reduc;
5953 tree induction_index = NULL_TREE;
5955 if (slp_node)
5956 group_size = SLP_TREE_LANES (slp_node);
5958 if (nested_in_vect_loop_p (loop, stmt_info))
5960 outer_loop = loop;
5961 loop = loop->inner;
5962 gcc_assert (!slp_node && double_reduc);
5965 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5966 gcc_assert (vectype);
5967 mode = TYPE_MODE (vectype);
5969 tree induc_val = NULL_TREE;
5970 tree adjustment_def = NULL;
5971 if (slp_node)
5973 else
5975 /* Optimize: for induction condition reduction, if we can't use zero
5976 for induc_val, use initial_def. */
5977 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5978 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5979 else if (double_reduc)
5981 else
5982 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5985 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5986 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5987 if (slp_reduc)
5988 /* All statements produce live-out values. */
5989 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5990 else if (slp_node)
5992 /* The last statement in the reduction chain produces the live-out
5993 value. Note SLP optimization can shuffle scalar stmts to
5994 optimize permutations, so we have to search for the last stmt.
5995 for (k = 0; k < group_size; ++k)
5996 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5998 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5999 break;
6003 unsigned vec_num;
6004 int ncopies;
6005 if (slp_node)
6007 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6008 ncopies = 1;
6010 else
6012 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6013 vec_num = 1;
6014 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6017 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6018 which is updated with the current index of the loop for every match of
6019 the original loop's cond_expr (VEC_STMT). This results in a vector
6020 containing, for each vector lane, the last iteration in which that lane's condition passed.
6021 The first match will be a 1 to allow 0 to be used for non-matching
6022 indexes. If there are no matches at all then the vector will be all
6023 zeroes.
6025 PR92772: This algorithm is broken for architectures that support
6026 masked vectors, but do not provide fold_extract_last. */
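/* As an illustration, with four lanes and matches only in scalar
   iterations 2 and 7 (counting from 1), the final index vector would be
   { 0, 2, 7, 0 }: lanes that never matched keep 0, the other lanes keep
   the 1-based number of their last matching iteration. */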
6027 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6029 auto_vec<std::pair<tree, bool>, 2> ccompares;
6030 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6031 cond_info = vect_stmt_to_vectorize (cond_info);
6032 while (cond_info != reduc_info)
6034 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6036 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6037 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6038 ccompares.safe_push
6039 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6040 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6042 cond_info
6043 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6044 1 + STMT_VINFO_REDUC_IDX
6045 (cond_info)));
6046 cond_info = vect_stmt_to_vectorize (cond_info);
6048 gcc_assert (ccompares.length () != 0);
6050 tree indx_before_incr, indx_after_incr;
6051 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6052 int scalar_precision
6053 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6054 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6055 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6056 (TYPE_MODE (vectype), cr_index_scalar_type,
6057 TYPE_VECTOR_SUBPARTS (vectype));
6059 /* First we create a simple vector induction variable which starts
6060 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6061 vector size (STEP). */
6063 /* Create a {1,2,3,...} vector. */
6064 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6066 /* Create a vector of the step value. */
6067 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6068 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6070 /* Create an induction variable. */
6071 gimple_stmt_iterator incr_gsi;
6072 bool insert_after;
6073 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6074 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6075 insert_after, &indx_before_incr, &indx_after_incr);
6077 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6078 filled with zeros (VEC_ZERO). */
6080 /* Create a vector of 0s. */
6081 tree zero = build_zero_cst (cr_index_scalar_type);
6082 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6084 /* Create a vector phi node. */
6085 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6086 new_phi = create_phi_node (new_phi_tree, loop->header);
6087 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6088 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6090 /* Now take the condition from the loop's original cond_exprs
6091 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6092 every match uses values from the induction variable
6093 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6094 (NEW_PHI_TREE).
6095 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6096 the new cond_expr (INDEX_COND_EXPR). */
6097 gimple_seq stmts = NULL;
6098 for (int i = ccompares.length () - 1; i != -1; --i)
6100 tree ccompare = ccompares[i].first;
6101 if (ccompares[i].second)
6102 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6103 cr_index_vector_type,
6104 ccompare,
6105 indx_before_incr, new_phi_tree);
6106 else
6107 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6108 cr_index_vector_type,
6109 ccompare,
6110 new_phi_tree, indx_before_incr);
6112 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6114 /* Update the phi with the vec cond. */
6115 induction_index = new_phi_tree;
6116 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6117 loop_latch_edge (loop), UNKNOWN_LOCATION);
6120 /* 2. Create epilog code.
6121 The reduction epilog code operates across the elements of the vector
6122 of partial results computed by the vectorized loop.
6123 The reduction epilog code consists of:
6125 step 1: compute the scalar result in a vector (v_out2)
6126 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6127 step 3: adjust the scalar result (s_out3) if needed.
6129 Step 1 can be accomplished using one of the following three schemes:
6130 (scheme 1) using reduc_fn, if available.
6131 (scheme 2) using whole-vector shifts, if available.
6132 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6133 combined.
6135 The overall epilog code looks like this:
6137 s_out0 = phi <s_loop> # original EXIT_PHI
6138 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6139 v_out2 = reduce <v_out1> # step 1
6140 s_out3 = extract_field <v_out2, 0> # step 2
6141 s_out4 = adjust_result <s_out3> # step 3
6143 (step 3 is optional, and steps 1 and 2 may be combined).
6144 Lastly, the uses of s_out0 are replaced by s_out4. */
6147 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6148 v_out1 = phi <VECT_DEF>
6149 Store their (possibly converted) results in REDUC_INPUTS. */
6150 if (double_reduc)
6151 loop = outer_loop;
6152 exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
6153 exit_gsi = gsi_after_labels (exit_bb);
6154 reduc_inputs.create (slp_node ? vec_num : ncopies);
6155 for (unsigned i = 0; i < vec_num; i++)
6157 gimple_seq stmts = NULL;
6158 if (slp_node)
6159 def = vect_get_slp_vect_def (slp_node, i);
6160 else
6161 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6162 for (j = 0; j < ncopies; j++)
6164 tree new_def = copy_ssa_name (def);
6165 phi = create_phi_node (new_def, exit_bb);
6166 if (j)
6167 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6168 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
6169 new_def = gimple_convert (&stmts, vectype, new_def);
6170 reduc_inputs.quick_push (new_def);
6172 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6175 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6176 (i.e. when reduc_fn is not available) and in the final adjustment
6177 code (if needed). Also get the original scalar reduction variable as
6178 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6179 represents a reduction pattern), the tree-code and scalar-def are
6180 taken from the original stmt that the pattern-stmt (STMT) replaces.
6181 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6182 are taken from STMT. */
6184 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6185 if (orig_stmt_info != stmt_info)
6187 /* Reduction pattern */
6188 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6189 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6192 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6193 scalar_type = TREE_TYPE (scalar_dest);
6194 scalar_results.truncate (0);
6195 scalar_results.reserve_exact (group_size);
6196 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6197 bitsize = TYPE_SIZE (scalar_type);
6199 /* True if we should implement SLP_REDUC using native reduction operations
6200 instead of scalar operations. */
6201 direct_slp_reduc = (reduc_fn != IFN_LAST
6202 && slp_reduc
6203 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6205 /* In case of reduction chain, e.g.,
6206 # a1 = phi <a3, a0>
6207 a2 = operation (a1)
6208 a3 = operation (a2),
6210 we may end up with more than one vector result. Here we reduce them
6211 to one vector.
6213 The same is true for a SLP reduction, e.g.,
6214 # a1 = phi <a2, a0>
6215 # b1 = phi <b2, b0>
6216 a2 = operation (a1)
6217 b2 = operation (b1),
6219 where we can end up with more than one vector as well. We can
6220 easily accumulate vectors when the number of vector elements is
6221 a multiple of the SLP group size.
6223 The same is true if we couldn't use a single def-use cycle. */
6224 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6225 || direct_slp_reduc
6226 || (slp_reduc
6227 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6228 || ncopies > 1)
6230 gimple_seq stmts = NULL;
6231 tree single_input = reduc_inputs[0];
6232 for (k = 1; k < reduc_inputs.length (); k++)
6233 single_input = gimple_build (&stmts, code, vectype,
6234 single_input, reduc_inputs[k]);
6235 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6237 reduc_inputs.truncate (0);
6238 reduc_inputs.safe_push (single_input);
6241 tree orig_reduc_input = reduc_inputs[0];
6243 /* If this loop is an epilogue loop that can be skipped after the
6244 main loop, we can only share a reduction operation between the
6245 main loop and the epilogue if we put it at the target of the
6246 skip edge.
6248 We can still reuse accumulators if this check fails. Doing so has
6249 the minor(?) benefit of making the epilogue loop's scalar result
6250 independent of the main loop's scalar result. */
6251 bool unify_with_main_loop_p = false;
6252 if (reduc_info->reused_accumulator
6253 && loop_vinfo->skip_this_loop_edge
6254 && single_succ_p (exit_bb)
6255 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6257 unify_with_main_loop_p = true;
6259 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6260 reduc_inputs[0] = make_ssa_name (vectype);
6261 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6262 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6263 UNKNOWN_LOCATION);
6264 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6265 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6266 exit_gsi = gsi_after_labels (reduc_block);
6269 /* Shouldn't be used beyond this point. */
6270 exit_bb = nullptr;
6272 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6273 && reduc_fn != IFN_LAST)
6275 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6276 various data values where the condition matched and another vector
6277 (INDUCTION_INDEX) containing all the indexes of those matches. We
6278 need to extract the last matching index (which will be the index with
6279 highest value) and use this to index into the data vector.
6280 For the case where there were no matches, the data vector will contain
6281 all default values and the index vector will be all zeros. */
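/* Continuing the illustration above: with an index vector { 0, 2, 7, 0 }
   and a data vector { d0, d1, d2, d3 }, the max index is 7, the compare
   selects only lane 2, giving { 0, 0, d2, 0 }, and the final unsigned
   MAX reduction therefore yields d2. */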
6283 /* Get various versions of the type of the vector of indexes. */
6284 tree index_vec_type = TREE_TYPE (induction_index);
6285 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6286 tree index_scalar_type = TREE_TYPE (index_vec_type);
6287 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6289 /* Get an unsigned integer version of the type of the data vector. */
6290 int scalar_precision
6291 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6292 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6293 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6294 vectype);
6296 /* First we need to create a vector (ZERO_VEC) of zeros and another
6297 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6298 can create using a MAX reduction and then expanding.
6299 In the case where the loop never made any matches, the max index will
6300 be zero. */
6302 /* Vector of {0, 0, 0,...}. */
6303 tree zero_vec = build_zero_cst (vectype);
6305 /* Find maximum value from the vector of found indexes. */
6306 tree max_index = make_ssa_name (index_scalar_type);
6307 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6308 1, induction_index);
6309 gimple_call_set_lhs (max_index_stmt, max_index);
6310 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6312 /* Vector of {max_index, max_index, max_index,...}. */
6313 tree max_index_vec = make_ssa_name (index_vec_type);
6314 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6315 max_index);
6316 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6317 max_index_vec_rhs);
6318 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6320 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6321 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6322 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6323 otherwise. Only one value should match, resulting in a vector
6324 (VEC_COND) with one data value and the rest zeros.
6325 In the case where the loop never made any matches, every index will
6326 match, resulting in a vector with all data values (which will all be
6327 the default value). */
6329 /* Compare the max index vector to the vector of found indexes to find
6330 the position of the max value. */
6331 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6332 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6333 induction_index,
6334 max_index_vec);
6335 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6337 /* Use the compare to choose either values from the data vector or
6338 zero. */
6339 tree vec_cond = make_ssa_name (vectype);
6340 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6341 vec_compare,
6342 reduc_inputs[0],
6343 zero_vec);
6344 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6346 /* Finally we need to extract the data value from the vector (VEC_COND)
6347 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6348 reduction, but because this doesn't exist, we can use a MAX reduction
6349 instead. The data value might be signed or a float so we need to cast
6350 it first.
6351 In the case where the loop never made any matches, the data values are
6352 all identical, and so will reduce down correctly. */
6354 /* Make the matched data values unsigned. */
6355 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6356 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6357 vec_cond);
6358 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6359 VIEW_CONVERT_EXPR,
6360 vec_cond_cast_rhs);
6361 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6363 /* Reduce down to a scalar value. */
6364 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6365 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6366 1, vec_cond_cast);
6367 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6368 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6370 /* Convert the reduced value back to the result type and set as the
6371 result. */
6372 gimple_seq stmts = NULL;
6373 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6374 data_reduc);
6375 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6376 scalar_results.safe_push (new_temp);
6378 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6379 && reduc_fn == IFN_LAST)
6381 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6382 idx = 0;
6383 idx_val = induction_index[0];
6384 val = data_reduc[0];
6385 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6386 if (induction_index[i] > idx_val)
6387 val = data_reduc[i], idx_val = induction_index[i];
6388 return val; */
6390 tree data_eltype = TREE_TYPE (vectype);
6391 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6392 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6393 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6394 /* Enforced by vectorizable_reduction, which ensures we have target
6395 support before allowing a conditional reduction on variable-length
6396 vectors. */
6397 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6398 tree idx_val = NULL_TREE, val = NULL_TREE;
6399 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6401 tree old_idx_val = idx_val;
6402 tree old_val = val;
6403 idx_val = make_ssa_name (idx_eltype);
6404 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6405 build3 (BIT_FIELD_REF, idx_eltype,
6406 induction_index,
6407 bitsize_int (el_size),
6408 bitsize_int (off)));
6409 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6410 val = make_ssa_name (data_eltype);
6411 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6412 build3 (BIT_FIELD_REF,
6413 data_eltype,
6414 reduc_inputs[0],
6415 bitsize_int (el_size),
6416 bitsize_int (off)));
6417 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6418 if (off != 0)
6420 tree new_idx_val = idx_val;
6421 if (off != v_size - el_size)
6423 new_idx_val = make_ssa_name (idx_eltype);
6424 epilog_stmt = gimple_build_assign (new_idx_val,
6425 MAX_EXPR, idx_val,
6426 old_idx_val);
6427 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6429 tree cond = make_ssa_name (boolean_type_node);
6430 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6431 idx_val, old_idx_val);
6432 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6433 tree new_val = make_ssa_name (data_eltype);
6434 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6435 cond, val, old_val);
6436 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6437 idx_val = new_idx_val;
6438 val = new_val;
6441 /* Convert the reduced value back to the result type and set as the
6442 result. */
6443 gimple_seq stmts = NULL;
6444 val = gimple_convert (&stmts, scalar_type, val);
6445 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6446 scalar_results.safe_push (val);
6449 /* 2.3 Create the reduction code, using one of the three schemes described
6450 above. In SLP we simply need to extract all the elements from the
6451 vector (without reducing them), so we use scalar shifts. */
6452 else if (reduc_fn != IFN_LAST && !slp_reduc)
6454 tree tmp;
6455 tree vec_elem_type;
6457 /* Case 1: Create:
6458 v_out2 = reduc_expr <v_out1> */
6460 if (dump_enabled_p ())
6461 dump_printf_loc (MSG_NOTE, vect_location,
6462 "Reduce using direct vector reduction.\n");
6464 gimple_seq stmts = NULL;
6465 vec_elem_type = TREE_TYPE (vectype);
6466 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6467 vec_elem_type, reduc_inputs[0]);
6468 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6469 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6471 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6472 && induc_val)
6474 /* Earlier we set the initial value to be a vector of induc_val
6475 values. Check the result and if it is induc_val then replace
6476 it with the original initial value, unless induc_val is
6477 the same as initial_def already. */
6478 tree zcompare = make_ssa_name (boolean_type_node);
6479 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6480 new_temp, induc_val);
6481 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6482 tree initial_def = reduc_info->reduc_initial_values[0];
6483 tmp = make_ssa_name (new_scalar_dest);
6484 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6485 initial_def, new_temp);
6486 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6487 new_temp = tmp;
6490 scalar_results.safe_push (new_temp);
6492 else if (direct_slp_reduc)
6494 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6495 with the elements for other SLP statements replaced with the
6496 neutral value. We can then do a normal reduction on each vector. */
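/* For instance, with a group size of 2 and a vector { a0, b0, a1, b1 },
   iteration i == 0 selects { a0, neutral, a1, neutral } and reduces it
   for the first SLP statement, while i == 1 selects
   { neutral, b0, neutral, b1 } for the second. */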
6498 /* Enforced by vectorizable_reduction. */
6499 gcc_assert (reduc_inputs.length () == 1);
6500 gcc_assert (pow2p_hwi (group_size));
6502 gimple_seq seq = NULL;
6504 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6505 and the same element size as VECTYPE. */
6506 tree index = build_index_vector (vectype, 0, 1);
6507 tree index_type = TREE_TYPE (index);
6508 tree index_elt_type = TREE_TYPE (index_type);
6509 tree mask_type = truth_type_for (index_type);
6511 /* Create a vector that, for each element, identifies which of
6512 the REDUC_GROUP_SIZE results should use it. */
6513 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6514 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6515 build_vector_from_val (index_type, index_mask));
6517 /* Get a neutral vector value. This is simply a splat of the neutral
6518 scalar value if we have one, otherwise the initial scalar value
6519 is itself a neutral value. */
6520 tree vector_identity = NULL_TREE;
6521 tree neutral_op = NULL_TREE;
6522 if (slp_node)
6524 tree initial_value = NULL_TREE;
6525 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6526 initial_value = reduc_info->reduc_initial_values[0];
6527 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6528 initial_value, false);
6530 if (neutral_op)
6531 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6532 neutral_op);
6533 for (unsigned int i = 0; i < group_size; ++i)
6535 /* If there's no universal neutral value, we can use the
6536 initial scalar value from the original PHI. This is used
6537 for MIN and MAX reduction, for example. */
6538 if (!neutral_op)
6540 tree scalar_value = reduc_info->reduc_initial_values[i];
6541 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6542 scalar_value);
6543 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6544 scalar_value);
6547 /* Calculate the equivalent of:
6549 sel[j] = (index[j] == i);
6551 which selects the elements of REDUC_INPUTS[0] that should
6552 be included in the result. */
6553 tree compare_val = build_int_cst (index_elt_type, i);
6554 compare_val = build_vector_from_val (index_type, compare_val);
6555 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6556 index, compare_val);
6558 /* Calculate the equivalent of:
6560 vec = sel ? reduc_inputs[0] : vector_identity;
6562 VEC is now suitable for a full vector reduction. */
6563 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6564 sel, reduc_inputs[0], vector_identity);
6566 /* Do the reduction and convert it to the appropriate type. */
6567 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6568 TREE_TYPE (vectype), vec);
6569 scalar = gimple_convert (&seq, scalar_type, scalar);
6570 scalar_results.safe_push (scalar);
6572 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6574 else
6576 bool reduce_with_shift;
6577 tree vec_temp;
6579 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6581 /* See if the target wants to do the final (shift) reduction
6582 in a vector mode of smaller size and first reduce upper/lower
6583 halves against each other. */
6584 enum machine_mode mode1 = mode;
6585 tree stype = TREE_TYPE (vectype);
6586 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6587 unsigned nunits1 = nunits;
6588 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6589 && reduc_inputs.length () == 1)
6591 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6592 /* For SLP reductions we have to make sure lanes match up, but
6593 since we're doing an individual-element final reduction, reducing
6594 the vector width here is even more important.
6595 ??? We could also separate lanes with permutes; for the common
6596 case of a power-of-two group size, odd/even extracts would work. */
6597 if (slp_reduc && nunits != nunits1)
6599 nunits1 = least_common_multiple (nunits1, group_size);
6600 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6603 if (!slp_reduc
6604 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6605 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6607 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6608 stype, nunits1);
6609 reduce_with_shift = have_whole_vector_shift (mode1);
6610 if (!VECTOR_MODE_P (mode1)
6611 || !directly_supported_p (code, vectype1))
6612 reduce_with_shift = false;
6614 /* First reduce the vector to the vector size we should do the
6615 shift reduction on, by combining upper and lower halves. */
6616 gimple_seq stmts = NULL;
6617 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6618 code, &stmts);
6619 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6620 reduc_inputs[0] = new_temp;
6622 if (reduce_with_shift && !slp_reduc)
6624 int element_bitsize = tree_to_uhwi (bitsize);
6625 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6626 for variable-length vectors and also requires direct target support
6627 for loop reductions. */
6628 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6629 int nelements = vec_size_in_bits / element_bitsize;
6630 vec_perm_builder sel;
6631 vec_perm_indices indices;
6633 int elt_offset;
6635 tree zero_vec = build_zero_cst (vectype1);
6636 /* Case 2: Create:
6637 for (offset = nelements/2; offset >= 1; offset/=2)
6639 Create: va' = vec_shift <va, offset>
6640 Create: va = vop <va, va'>
6641 } */
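/* For example (a sketch, assuming a 4-element vector <a0,a1,a2,a3>, a PLUS
   reduction and don't-care lanes written as '.'; the actual permute masks
   and element order are target-dependent):

     t = vec_shift <v, 2>   -> <a2, a3, 0, 0>
     v = v + t              -> <a0+a2, a1+a3, ., .>
     t = vec_shift <v, 1>   -> <a1+a3, ., ., .>
     v = v + t              -> <a0+a1+a2+a3, ., ., .>

   after which step 2.4 below extracts the scalar result from element 0.  */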
6643 tree rhs;
6645 if (dump_enabled_p ())
6646 dump_printf_loc (MSG_NOTE, vect_location,
6647 "Reduce using vector shifts\n");
6649 gimple_seq stmts = NULL;
6650 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6651 for (elt_offset = nelements / 2;
6652 elt_offset >= 1;
6653 elt_offset /= 2)
6655 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6656 indices.new_vector (sel, 2, nelements);
6657 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6658 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6659 new_temp, zero_vec, mask);
6660 new_temp = gimple_build (&stmts, code,
6661 vectype1, new_name, new_temp);
6663 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6665 /* 2.4 Extract the final scalar result. Create:
6666 s_out3 = extract_field <v_out2, bitpos> */
6668 if (dump_enabled_p ())
6669 dump_printf_loc (MSG_NOTE, vect_location,
6670 "extract scalar result\n");
6672 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6673 bitsize, bitsize_zero_node);
6674 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6675 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6676 gimple_assign_set_lhs (epilog_stmt, new_temp);
6677 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6678 scalar_results.safe_push (new_temp);
6680 else
6682 /* Case 3: Create:
6683 s = extract_field <v_out2, 0>
6684 for (offset = element_size;
6685 offset < vector_size;
6686 offset += element_size)
6688 Create: s' = extract_field <v_out2, offset>
6689 Create: s = op <s, s'> // For non-SLP cases
6690 } */
6692 if (dump_enabled_p ())
6693 dump_printf_loc (MSG_NOTE, vect_location,
6694 "Reduce using scalar code.\n");
6696 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6697 int element_bitsize = tree_to_uhwi (bitsize);
6698 tree compute_type = TREE_TYPE (vectype);
6699 gimple_seq stmts = NULL;
6700 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6702 int bit_offset;
6703 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6704 vec_temp, bitsize, bitsize_zero_node);
6706 /* In SLP we don't need to apply the reduction operation, so we just
6707 collect s' values in SCALAR_RESULTS. */
6708 if (slp_reduc)
6709 scalar_results.safe_push (new_temp);
6711 for (bit_offset = element_bitsize;
6712 bit_offset < vec_size_in_bits;
6713 bit_offset += element_bitsize)
6715 tree bitpos = bitsize_int (bit_offset);
6716 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6717 compute_type, vec_temp,
6718 bitsize, bitpos);
6719 if (slp_reduc)
6721 /* In SLP we don't need to apply the reduction operation, so
6722 we just collect s' values in SCALAR_RESULTS. */
6723 new_temp = new_name;
6724 scalar_results.safe_push (new_name);
6726 else
6727 new_temp = gimple_build (&stmts, code, compute_type,
6728 new_name, new_temp);
6732 /* The only case where we need to reduce scalar results in SLP is
6733 unrolling. If the size of SCALAR_RESULTS is greater than
6734 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6735 REDUC_GROUP_SIZE. */
6736 if (slp_reduc)
6738 tree res, first_res, new_res;
6740 /* Reduce multiple scalar results in case of SLP unrolling. */
6741 for (j = group_size; scalar_results.iterate (j, &res);
6742 j++)
6744 first_res = scalar_results[j % group_size];
6745 new_res = gimple_build (&stmts, code, compute_type,
6746 first_res, res);
6747 scalar_results[j % group_size] = new_res;
6749 scalar_results.truncate (group_size);
6750 for (k = 0; k < group_size; k++)
6751 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6752 scalar_results[k]);
6754 else
6756 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6757 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6758 scalar_results.safe_push (new_temp);
6761 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6764 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6765 && induc_val)
6767 /* Earlier we set the initial value to be a vector of induc_val
6768 values. Check the result and if it is induc_val then replace
6769 it with the original initial value, unless induc_val is
6770 the same as initial_def already. */
6771 tree zcompare = make_ssa_name (boolean_type_node);
6772 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6773 induc_val);
6774 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6775 tree initial_def = reduc_info->reduc_initial_values[0];
6776 tree tmp = make_ssa_name (new_scalar_dest);
6777 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6778 initial_def, new_temp);
6779 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6780 scalar_results[0] = tmp;
6784 /* 2.5 Adjust the final result by the initial value of the reduction
6785 variable. (When such adjustment is not needed, then
6786 'adjustment_def' is zero). For example, if code is PLUS we create:
6787 new_temp = loop_exit_def + adjustment_def */
6789 if (adjustment_def)
6791 gcc_assert (!slp_reduc);
6792 gimple_seq stmts = NULL;
6793 if (double_reduc)
6795 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6796 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6797 new_temp = gimple_build (&stmts, code, vectype,
6798 reduc_inputs[0], adjustment_def);
6800 else
6802 new_temp = scalar_results[0];
6803 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6804 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6805 adjustment_def);
6806 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6807 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6808 new_temp, adjustment_def);
6809 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6812 epilog_stmt = gimple_seq_last_stmt (stmts);
6813 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6814 scalar_results[0] = new_temp;
6817 /* Record this operation if it could be reused by the epilogue loop. */
6818 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6819 && reduc_inputs.length () == 1)
6820 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6821 { orig_reduc_input, reduc_info });
6823 if (double_reduc)
6824 loop = outer_loop;
6826 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6827 phis with new adjusted scalar results, i.e., replace use <s_out0>
6828 with use <s_out4>.
6830 Transform:
6831 loop_exit:
6832 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6833 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6834 v_out2 = reduce <v_out1>
6835 s_out3 = extract_field <v_out2, 0>
6836 s_out4 = adjust_result <s_out3>
6837 use <s_out0>
6838 use <s_out0>
6840 into:
6842 loop_exit:
6843 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6844 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6845 v_out2 = reduce <v_out1>
6846 s_out3 = extract_field <v_out2, 0>
6847 s_out4 = adjust_result <s_out3>
6848 use <s_out4>
6849 use <s_out4> */
6851 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6852 for (k = 0; k < live_out_stmts.size (); k++)
6854 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6855 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6857 phis.create (3);
6858 /* Find the loop-closed-use at the loop exit of the original scalar
6859 result. (The reduction result is expected to have two immediate uses,
6860 one at the latch block, and one at the loop exit). For double
6861 reductions we are looking for exit phis of the outer loop. */
6862 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6864 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6866 if (!is_gimple_debug (USE_STMT (use_p)))
6867 phis.safe_push (USE_STMT (use_p));
6869 else
6871 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6873 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6875 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6877 if (!flow_bb_inside_loop_p (loop,
6878 gimple_bb (USE_STMT (phi_use_p)))
6879 && !is_gimple_debug (USE_STMT (phi_use_p)))
6880 phis.safe_push (USE_STMT (phi_use_p));
6886 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6888 /* Replace the uses: */
6889 orig_name = PHI_RESULT (exit_phi);
6891 /* Look for a single use at the target of the skip edge. */
6892 if (unify_with_main_loop_p)
6894 use_operand_p use_p;
6895 gimple *user;
6896 if (!single_imm_use (orig_name, &use_p, &user))
6897 gcc_unreachable ();
6898 orig_name = gimple_get_lhs (user);
6901 scalar_result = scalar_results[k];
6902 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6904 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6905 SET_USE (use_p, scalar_result);
6906 update_stmt (use_stmt);
6910 phis.release ();
6914 /* Return a vector of type VECTYPE that is equal to the vector select
6915 operation "MASK ? VEC : IDENTITY". Insert the select statements
6916 before GSI. */
6918 static tree
6919 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6920 tree vec, tree identity)
6922 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6923 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6924 mask, vec, identity);
6925 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6926 return cond;
6929 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6930 order, starting with LHS. Insert the extraction statements before GSI and
6931 associate the new scalar SSA names with variable SCALAR_DEST.
6932 If MASK is nonzero mask the input and then operate on it unconditionally.
6933 Return the SSA name for the result. */
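/* For example (a sketch), with a 4-element VECTOR_RHS v and CODE PLUS_EXPR
   the expansion emitted below is equivalent to:

     s0 = lhs + v[0];
     s1 = s0 + v[1];
     s2 = s1 + v[2];
     s3 = s2 + v[3];

   with S3 returned, so the original left-to-right (in-order) association
   is preserved.  */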
6935 static tree
6936 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6937 tree_code code, tree lhs, tree vector_rhs,
6938 tree mask)
6940 tree vectype = TREE_TYPE (vector_rhs);
6941 tree scalar_type = TREE_TYPE (vectype);
6942 tree bitsize = TYPE_SIZE (scalar_type);
6943 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6944 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6946 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6947 to perform an unconditional element-wise reduction of it. */
6948 if (mask)
6950 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6951 "masked_vector_rhs");
6952 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6953 false);
6954 tree vector_identity = build_vector_from_val (vectype, neutral_op);
6955 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6956 mask, vector_rhs, vector_identity);
6957 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6958 vector_rhs = masked_vector_rhs;
6961 for (unsigned HOST_WIDE_INT bit_offset = 0;
6962 bit_offset < vec_size_in_bits;
6963 bit_offset += element_bitsize)
6965 tree bitpos = bitsize_int (bit_offset);
6966 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6967 bitsize, bitpos);
6969 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6970 rhs = make_ssa_name (scalar_dest, stmt);
6971 gimple_assign_set_lhs (stmt, rhs);
6972 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6974 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6975 tree new_name = make_ssa_name (scalar_dest, stmt);
6976 gimple_assign_set_lhs (stmt, new_name);
6977 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6978 lhs = new_name;
6980 return lhs;
6983 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6984 type of the vector input. */
6986 static internal_fn
6987 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6989 internal_fn mask_reduc_fn;
6990 internal_fn mask_len_reduc_fn;
6992 switch (reduc_fn)
6994 case IFN_FOLD_LEFT_PLUS:
6995 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6996 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6997 break;
6999 default:
7000 return IFN_LAST;
7003 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7004 OPTIMIZE_FOR_SPEED))
7005 return mask_reduc_fn;
7006 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7007 OPTIMIZE_FOR_SPEED))
7008 return mask_len_reduc_fn;
7009 return IFN_LAST;
7012 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7013 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7014 statement. CODE is the operation performed by STMT_INFO and OPS are
7015 its scalar operands. REDUC_INDEX is the index of the operand in
7016 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7017 implements in-order reduction, or IFN_LAST if we should open-code it.
7018 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7019 that should be used to control the operation in a fully-masked loop. */
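/* As an illustration (a sketch; the exact statements depend on the target
   and on whether the loop is masked or uses lengths), a scalar in-order
   reduction such as

     double res = init;
     for (int i = 0; i < n; ++i)
       res = res + a[i];

   keeps RES scalar and, in each vectorized iteration, folds one vector of
   A into it in element order:

     res_1 = IFN_FOLD_LEFT_PLUS (res_0, va);

   or, if REDUC_FN is IFN_LAST, open-codes the per-element folds via
   vect_expand_fold_left.  */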
7021 static bool
7022 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7023 stmt_vec_info stmt_info,
7024 gimple_stmt_iterator *gsi,
7025 gimple **vec_stmt, slp_tree slp_node,
7026 gimple *reduc_def_stmt,
7027 code_helper code, internal_fn reduc_fn,
7028 tree *ops, int num_ops, tree vectype_in,
7029 int reduc_index, vec_loop_masks *masks,
7030 vec_loop_lens *lens)
7032 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7033 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7034 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7036 int ncopies;
7037 if (slp_node)
7038 ncopies = 1;
7039 else
7040 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7042 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7043 gcc_assert (ncopies == 1);
7045 bool is_cond_op = false;
7046 if (!code.is_tree_code ())
7048 code = conditional_internal_fn_code (internal_fn (code));
7049 gcc_assert (code != ERROR_MARK);
7050 is_cond_op = true;
7053 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7055 if (slp_node)
7057 if (is_cond_op)
7059 if (dump_enabled_p ())
7060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7061 "fold-left reduction on SLP not supported.\n");
7062 return false;
7065 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7066 TYPE_VECTOR_SUBPARTS (vectype_in)));
7069 /* The operands either come from a binary operation or an IFN_COND operation.
7070 The former is a gimple assign with binary rhs and the latter is a
7071 gimple call with four arguments. */
7072 gcc_assert (num_ops == 2 || num_ops == 4);
7073 tree op0, opmask;
7074 if (!is_cond_op)
7075 op0 = ops[1 - reduc_index];
7076 else
7078 op0 = ops[2 + (1 - reduc_index)];
7079 opmask = ops[0];
7080 gcc_assert (!slp_node);
7083 int group_size = 1;
7084 stmt_vec_info scalar_dest_def_info;
7085 auto_vec<tree> vec_oprnds0, vec_opmask;
7086 if (slp_node)
7088 auto_vec<vec<tree> > vec_defs (2);
7089 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7090 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7091 vec_defs[0].release ();
7092 vec_defs[1].release ();
7093 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7094 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7096 else
7098 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7099 op0, &vec_oprnds0);
7100 scalar_dest_def_info = stmt_info;
7102 /* For an IFN_COND_OP we also need the vector mask operand. */
7103 if (is_cond_op)
7104 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7105 opmask, &vec_opmask);
7108 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7109 tree scalar_dest = gimple_get_lhs (sdef);
7110 tree scalar_type = TREE_TYPE (scalar_dest);
7111 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7113 int vec_num = vec_oprnds0.length ();
7114 gcc_assert (vec_num == 1 || slp_node);
7115 tree vec_elem_type = TREE_TYPE (vectype_out);
7116 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7118 tree vector_identity = NULL_TREE;
7119 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7121 vector_identity = build_zero_cst (vectype_out);
7122 if (!HONOR_SIGNED_ZEROS (vectype_out))
7124 else
7126 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7127 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7128 vector_identity);
7132 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7133 int i;
7134 tree def0;
7135 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7137 gimple *new_stmt;
7138 tree mask = NULL_TREE;
7139 tree len = NULL_TREE;
7140 tree bias = NULL_TREE;
7141 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7142 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7143 else if (is_cond_op)
7144 mask = vec_opmask[0];
7145 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7147 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7148 i, 1);
7149 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7150 bias = build_int_cst (intQI_type_node, biasval);
7151 if (!is_cond_op)
7152 mask = build_minus_one_cst (truth_type_for (vectype_in));
7155 /* Handle MINUS by adding the negative. */
7156 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7158 tree negated = make_ssa_name (vectype_out);
7159 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7160 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7161 def0 = negated;
7164 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7165 && mask && mask_reduc_fn == IFN_LAST)
7166 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7167 vector_identity);
7169 /* On the first iteration the input is simply the scalar phi
7170 result, and for subsequent iterations it is the output of
7171 the preceding operation. */
7172 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7174 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7175 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7176 def0, mask, len, bias);
7177 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7178 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7179 def0, mask);
7180 else
7181 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7182 def0);
7183 /* For chained SLP reductions the output of the previous reduction
7184 operation serves as the input of the next. For the final statement
7185 the output cannot be a temporary - we reuse the original
7186 scalar destination of the last statement. */
7187 if (i != vec_num - 1)
7189 gimple_set_lhs (new_stmt, scalar_dest_var);
7190 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7191 gimple_set_lhs (new_stmt, reduc_var);
7194 else
7196 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7197 tree_code (code), reduc_var, def0,
7198 mask);
7199 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7200 /* Remove the statement, so that we can use the same code paths
7201 as for statements that we've just created. */
7202 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7203 gsi_remove (&tmp_gsi, true);
7206 if (i == vec_num - 1)
7208 gimple_set_lhs (new_stmt, scalar_dest);
7209 vect_finish_replace_stmt (loop_vinfo,
7210 scalar_dest_def_info,
7211 new_stmt);
7213 else
7214 vect_finish_stmt_generation (loop_vinfo,
7215 scalar_dest_def_info,
7216 new_stmt, gsi);
7218 if (slp_node)
7219 slp_node->push_vec_def (new_stmt);
7220 else
7222 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7223 *vec_stmt = new_stmt;
7227 return true;
7230 /* Function is_nonwrapping_integer_induction.
7232 Check if STMT_VINFO (which is part of loop LOOP) both increments and
7233 does not cause overflow. */
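/* For example (a sketch), for an unsigned 32-bit IV, where wrapping is
   well-defined and therefore checked explicitly below, base 16 and step 4
   in a loop executing at most 2^20 times reach at most 16 + 4 * 2^20,
   which fits in 32 bits, so the induction cannot wrap; with step 2^12 the
   maximum becomes 2^32 + 16, which does not fit, and false is returned.  */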
7235 static bool
7236 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7238 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7239 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7240 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7241 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7242 widest_int ni, max_loop_value, lhs_max;
7243 wi::overflow_type overflow = wi::OVF_NONE;
7245 /* Make sure the loop is integer based. */
7246 if (TREE_CODE (base) != INTEGER_CST
7247 || TREE_CODE (step) != INTEGER_CST)
7248 return false;
7250 /* Check that the max size of the loop will not wrap. */
7252 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7253 return true;
7255 if (! max_stmt_executions (loop, &ni))
7256 return false;
7258 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7259 &overflow);
7260 if (overflow)
7261 return false;
7263 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7264 TYPE_SIGN (lhs_type), &overflow);
7265 if (overflow)
7266 return false;
7268 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7269 <= TYPE_PRECISION (lhs_type));
7272 /* Check if masking can be supported by inserting a conditional expression.
7273 CODE is the code for the operation. COND_FN is the conditional internal
7274 function, if it exists. VECTYPE_IN is the type of the vector input. */
7275 static bool
7276 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7277 tree vectype_in)
7279 if (cond_fn != IFN_LAST
7280 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7281 OPTIMIZE_FOR_SPEED))
7282 return false;
7284 if (code.is_tree_code ())
7285 switch (tree_code (code))
7287 case DOT_PROD_EXPR:
7288 case SAD_EXPR:
7289 return true;
7291 default:
7292 break;
7294 return false;
7297 /* Insert a conditional expression to enable masked vectorization. CODE is the
7298 code for the operation. VOP is the array of operands. MASK is the loop
7299 mask. GSI is a statement iterator used to place the new conditional
7300 expression. */
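/* For instance (a sketch of the generated selects), for a masked
   DOT_PROD_EXPR the second multiplication operand becomes

     masked_op1 = mask ? op1 : 0;

   so inactive lanes add 0 to the accumulator, while for SAD_EXPR it becomes

     masked_op1 = mask ? op1 : op0;

   so the absolute difference in inactive lanes is 0.  */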
7301 static void
7302 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7303 gimple_stmt_iterator *gsi)
7305 switch (tree_code (code))
7307 case DOT_PROD_EXPR:
7309 tree vectype = TREE_TYPE (vop[1]);
7310 tree zero = build_zero_cst (vectype);
7311 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7312 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7313 mask, vop[1], zero);
7314 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7315 vop[1] = masked_op1;
7316 break;
7319 case SAD_EXPR:
7321 tree vectype = TREE_TYPE (vop[1]);
7322 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7323 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7324 mask, vop[1], vop[0]);
7325 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7326 vop[1] = masked_op1;
7327 break;
7330 default:
7331 gcc_unreachable ();
7335 /* Function vectorizable_reduction.
7337 Check if STMT_INFO performs a reduction operation that can be vectorized.
7338 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7339 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7340 Return true if STMT_INFO is vectorizable in this way.
7342 This function also handles reduction idioms (patterns) that have been
7343 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7344 may be of this form:
7345 X = pattern_expr (arg0, arg1, ..., X)
7346 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7347 sequence that had been detected and replaced by the pattern-stmt
7348 (STMT_INFO).
7350 This function also handles reduction of condition expressions, for example:
7351 for (int i = 0; i < N; i++)
7352 if (a[i] < value)
7353 last = a[i];
7354 This is handled by vectorising the loop and creating an additional vector
7355 containing the loop indexes for which "a[i] < value" was true. In the
7356 function epilogue this is reduced to a single max value and then used to
7357 index into the vector of results.
7359 In some cases of reduction patterns, the type of the reduction variable X is
7360 different than the type of the other arguments of STMT_INFO.
7361 In such cases, the vectype that is used when transforming STMT_INFO into
7362 a vector stmt is different than the vectype that is used to determine the
7363 vectorization factor, because it consists of a different number of elements
7364 than the actual number of elements that are being operated upon in parallel.
7366 For example, consider an accumulation of shorts into an int accumulator.
7367 On some targets it's possible to vectorize this pattern operating on 8
7368 shorts at a time (hence, the vectype for purposes of determining the
7369 vectorization factor should be V8HI); on the other hand, the vectype that
7370 is used to create the vector form is actually V4SI (the type of the result).
7372 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7373 indicates what is the actual level of parallelism (V8HI in the example), so
7374 that the right vectorization factor would be derived. This vectype
7375 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7376 be used to create the vectorized stmt. The right vectype for the vectorized
7377 stmt is obtained from the type of the result X:
7378 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7380 This means that, contrary to "regular" reductions (or "regular" stmts in
7381 general), the following equation:
7382 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7383 does *NOT* necessarily hold for reduction patterns. */
7385 bool
7386 vectorizable_reduction (loop_vec_info loop_vinfo,
7387 stmt_vec_info stmt_info, slp_tree slp_node,
7388 slp_instance slp_node_instance,
7389 stmt_vector_for_cost *cost_vec)
7391 tree vectype_in = NULL_TREE;
7392 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7393 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7394 stmt_vec_info cond_stmt_vinfo = NULL;
7395 int i;
7396 int ncopies;
7397 bool single_defuse_cycle = false;
7398 bool nested_cycle = false;
7399 bool double_reduc = false;
7400 int vec_num;
7401 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7402 tree cond_reduc_val = NULL_TREE;
7404 /* Make sure it was already recognized as a reduction computation. */
7405 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7406 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7407 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7408 return false;
7410 /* The stmt we store reduction analysis meta on. */
7411 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7412 reduc_info->is_reduc_info = true;
7414 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7416 if (is_a <gphi *> (stmt_info->stmt))
7418 if (slp_node)
7420 /* We eventually need to set a vector type on invariant
7421 arguments. */
7422 unsigned j;
7423 slp_tree child;
7424 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7425 if (!vect_maybe_update_slp_op_vectype
7426 (child, SLP_TREE_VECTYPE (slp_node)))
7428 if (dump_enabled_p ())
7429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7430 "incompatible vector types for "
7431 "invariants\n");
7432 return false;
7435 /* Analysis for double-reduction is done on the outer
7436 loop PHI, nested cycles have no further restrictions. */
7437 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7439 else
7440 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7441 return true;
7444 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7445 stmt_vec_info phi_info = stmt_info;
7446 if (!is_a <gphi *> (stmt_info->stmt))
7448 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7449 return true;
7451 if (slp_node)
7453 slp_node_instance->reduc_phis = slp_node;
7454 /* ??? We're leaving slp_node to point to the PHIs; we only
7455 need it to get at the number of vector stmts which wasn't
7456 yet initialized for the instance root. */
7458 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7460 use_operand_p use_p;
7461 gimple *use_stmt;
7462 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7463 &use_p, &use_stmt);
7464 gcc_assert (res);
7465 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7468 /* PHIs should not participate in patterns. */
7469 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7470 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7472 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7473 and compute the reduction chain length. Discover the real
7474 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7475 tree reduc_def
7476 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7477 loop_latch_edge
7478 (gimple_bb (reduc_def_phi)->loop_father));
7479 unsigned reduc_chain_length = 0;
7480 bool only_slp_reduc_chain = true;
7481 stmt_info = NULL;
7482 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7483 while (reduc_def != PHI_RESULT (reduc_def_phi))
7485 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7486 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7487 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7489 if (dump_enabled_p ())
7490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7491 "reduction chain broken by patterns.\n");
7492 return false;
7494 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7495 only_slp_reduc_chain = false;
7496 /* For epilogue generation live members of the chain need
7497 to point back to the PHI via their original stmt for
7498 info_for_reduction to work. For SLP we need to look at
7499 all lanes here - even though we only will vectorize from
7500 the SLP node with live lane zero the other live lanes also
7501 need to be identified as part of a reduction to be able
7502 to skip code generation for them. */
7503 if (slp_for_stmt_info)
7505 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7506 if (STMT_VINFO_LIVE_P (s))
7507 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7509 else if (STMT_VINFO_LIVE_P (vdef))
7510 STMT_VINFO_REDUC_DEF (def) = phi_info;
7511 gimple_match_op op;
7512 if (!gimple_extract_op (vdef->stmt, &op))
7514 if (dump_enabled_p ())
7515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7516 "reduction chain includes unsupported"
7517 " statement type.\n");
7518 return false;
7520 if (CONVERT_EXPR_CODE_P (op.code))
7522 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7524 if (dump_enabled_p ())
7525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7526 "conversion in the reduction chain.\n");
7527 return false;
7530 else if (!stmt_info)
7531 /* First non-conversion stmt. */
7532 stmt_info = vdef;
7533 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7534 reduc_chain_length++;
7535 if (!stmt_info && slp_node)
7536 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7538 /* PHIs should not participate in patterns. */
7539 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7541 if (nested_in_vect_loop_p (loop, stmt_info))
7543 loop = loop->inner;
7544 nested_cycle = true;
7547 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7548 element. */
7549 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7551 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7552 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7554 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7555 gcc_assert (slp_node
7556 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7558 /* 1. Is vectorizable reduction? */
7559 /* Not supportable if the reduction variable is used in the loop, unless
7560 it's a reduction chain. */
7561 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7562 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7563 return false;
7565 /* Reductions that are not used even in an enclosing outer-loop,
7566 are expected to be "live" (used out of the loop). */
7567 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7568 && !STMT_VINFO_LIVE_P (stmt_info))
7569 return false;
7571 /* 2. Has this been recognized as a reduction pattern?
7573 Check if STMT represents a pattern that has been recognized
7574 in earlier analysis stages. For stmts that represent a pattern,
7575 the STMT_VINFO_RELATED_STMT field records the last stmt in
7576 the original sequence that constitutes the pattern. */
7578 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7579 if (orig_stmt_info)
7581 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7582 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7585 /* 3. Check the operands of the operation. The first operands are defined
7586 inside the loop body. The last operand is the reduction variable,
7587 which is defined by the loop-header-phi. */
7589 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7590 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7591 gimple_match_op op;
7592 if (!gimple_extract_op (stmt_info->stmt, &op))
7593 gcc_unreachable ();
7594 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7595 || op.code == WIDEN_SUM_EXPR
7596 || op.code == SAD_EXPR);
7598 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7599 && !SCALAR_FLOAT_TYPE_P (op.type))
7600 return false;
7602 /* Do not try to vectorize bit-precision reductions. */
7603 if (!type_has_mode_precision_p (op.type))
7604 return false;
7606 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7607 which means the only use of the PHI may be in the lane-reducing operation. */
7608 if (lane_reduc_code_p
7609 && reduc_chain_length != 1
7610 && !only_slp_reduc_chain)
7612 if (dump_enabled_p ())
7613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7614 "lane-reducing reduction with extra stmts.\n");
7615 return false;
7618 /* All uses but the last are expected to be defined in the loop.
7619 The last use is the reduction variable. In case of nested cycle this
7620 assumption is not true: we use reduc_index to record the index of the
7621 reduction variable. */
7622 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7623 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7624 /* We need to skip an extra operand for COND_EXPRs with embedded
7625 comparison. */
7626 unsigned opno_adjust = 0;
7627 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7628 opno_adjust = 1;
7629 for (i = 0; i < (int) op.num_ops; i++)
7631 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7632 if (i == 0 && op.code == COND_EXPR)
7633 continue;
7635 stmt_vec_info def_stmt_info;
7636 enum vect_def_type dt;
7637 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7638 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7639 &vectype_op[i], &def_stmt_info))
7641 if (dump_enabled_p ())
7642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7643 "use not simple.\n");
7644 return false;
7646 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7647 continue;
7649 /* For an IFN_COND_OP we might hit the reduction definition operand
7650 twice (once as definition, once as else). */
7651 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7652 continue;
7654 /* There should be only one cycle def in the stmt, the one
7655 leading to reduc_def. */
7656 if (VECTORIZABLE_CYCLE_DEF (dt))
7657 return false;
7659 if (!vectype_op[i])
7660 vectype_op[i]
7661 = get_vectype_for_scalar_type (loop_vinfo,
7662 TREE_TYPE (op.ops[i]), slp_op[i]);
7664 /* To properly compute ncopies we are interested in the widest
7665 non-reduction input type in case we're looking at a widening
7666 accumulation that we later handle in vect_transform_reduction. */
7667 if (lane_reduc_code_p
7668 && vectype_op[i]
7669 && (!vectype_in
7670 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7671 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7672 vectype_in = vectype_op[i];
7674 if (op.code == COND_EXPR)
7676 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7677 if (dt == vect_constant_def)
7679 cond_reduc_dt = dt;
7680 cond_reduc_val = op.ops[i];
7682 if (dt == vect_induction_def
7683 && def_stmt_info
7684 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7686 cond_reduc_dt = dt;
7687 cond_stmt_vinfo = def_stmt_info;
7691 if (!vectype_in)
7692 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7693 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7695 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7696 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7697 /* If we have a condition reduction, see if we can simplify it further. */
7698 if (v_reduc_type == COND_REDUCTION)
7700 if (slp_node)
7701 return false;
7703 /* When the condition uses the reduction value in the condition, fail. */
7704 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7706 if (dump_enabled_p ())
7707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7708 "condition depends on previous iteration\n");
7709 return false;
7712 if (reduc_chain_length == 1
7713 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7714 OPTIMIZE_FOR_SPEED)
7715 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7716 vectype_in,
7717 OPTIMIZE_FOR_SPEED)))
7719 if (dump_enabled_p ())
7720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7721 "optimizing condition reduction with"
7722 " FOLD_EXTRACT_LAST.\n");
7723 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7725 else if (cond_reduc_dt == vect_induction_def)
7727 tree base
7728 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7729 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7731 gcc_assert (TREE_CODE (base) == INTEGER_CST
7732 && TREE_CODE (step) == INTEGER_CST);
7733 cond_reduc_val = NULL_TREE;
7734 enum tree_code cond_reduc_op_code = ERROR_MARK;
7735 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7736 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7738 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7739 above base; punt if base is the minimum value of the type for
7740 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7741 else if (tree_int_cst_sgn (step) == -1)
7743 cond_reduc_op_code = MIN_EXPR;
7744 if (tree_int_cst_sgn (base) == -1)
7745 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7746 else if (tree_int_cst_lt (base,
7747 TYPE_MAX_VALUE (TREE_TYPE (base))))
7748 cond_reduc_val
7749 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7751 else
7753 cond_reduc_op_code = MAX_EXPR;
7754 if (tree_int_cst_sgn (base) == 1)
7755 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7756 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7757 base))
7758 cond_reduc_val
7759 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7761 if (cond_reduc_val)
7763 if (dump_enabled_p ())
7764 dump_printf_loc (MSG_NOTE, vect_location,
7765 "condition expression based on "
7766 "integer induction.\n");
7767 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7768 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7769 = cond_reduc_val;
7770 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7773 else if (cond_reduc_dt == vect_constant_def)
7775 enum vect_def_type cond_initial_dt;
7776 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7777 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7778 if (cond_initial_dt == vect_constant_def
7779 && types_compatible_p (TREE_TYPE (cond_initial_val),
7780 TREE_TYPE (cond_reduc_val)))
7782 tree e = fold_binary (LE_EXPR, boolean_type_node,
7783 cond_initial_val, cond_reduc_val);
7784 if (e && (integer_onep (e) || integer_zerop (e)))
7786 if (dump_enabled_p ())
7787 dump_printf_loc (MSG_NOTE, vect_location,
7788 "condition expression based on "
7789 "compile time constant.\n");
7790 /* Record reduction code at analysis stage. */
7791 STMT_VINFO_REDUC_CODE (reduc_info)
7792 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7793 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7799 if (STMT_VINFO_LIVE_P (phi_info))
7800 return false;
7802 if (slp_node)
7803 ncopies = 1;
7804 else
7805 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7807 gcc_assert (ncopies >= 1);
7809 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7811 if (nested_cycle)
7813 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7814 == vect_double_reduction_def);
7815 double_reduc = true;
7818 /* 4.2. Check support for the epilog operation.
7820 If STMT represents a reduction pattern, then the type of the
7821 reduction variable may be different than the type of the rest
7822 of the arguments. For example, consider the case of accumulation
7823 of shorts into an int accumulator; The original code:
7824 S1: int_a = (int) short_a;
7825 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7827 was replaced with:
7828 STMT: int_acc = widen_sum <short_a, int_acc>
7830 This means that:
7831 1. The tree-code that is used to create the vector operation in the
7832 epilog code (that reduces the partial results) is not the
7833 tree-code of STMT, but is rather the tree-code of the original
7834 stmt from the pattern that STMT is replacing. I.e, in the example
7835 above we want to use 'widen_sum' in the loop, but 'plus' in the
7836 epilog.
7837 2. The type (mode) we use to check available target support
7838 for the vector operation to be created in the *epilog*, is
7839 determined by the type of the reduction variable (in the example
7840 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7841 However the type (mode) we use to check available target support
7842 for the vector operation to be created *inside the loop*, is
7843 determined by the type of the other arguments to STMT (in the
7844 example we'd check this: optab_handler (widen_sum_optab,
7845 vect_short_mode)).
7847 This is contrary to "regular" reductions, in which the types of all
7848 the arguments are the same as the type of the reduction variable.
7849 For "regular" reductions we can therefore use the same vector type
7850 (and also the same tree-code) when generating the epilog code and
7851 when generating the code inside the loop. */
7853 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7855 /* If-conversion might already have created a conditional operation like
7856 IFN_COND_ADD. Use the wrapped tree code for the following checks. */
7857 if (orig_code.is_internal_fn ())
7859 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7860 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7863 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7865 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7866 if (reduction_type == TREE_CODE_REDUCTION)
7868 /* Check whether it's ok to change the order of the computation.
7869 Generally, when vectorizing a reduction we change the order of the
7870 computation. This may change the behavior of the program in some
7871 cases, so we need to check that this is ok. One exception is when
7872 vectorizing an outer-loop: the inner-loop is executed sequentially,
7873 and therefore vectorizing reductions in the inner-loop during
7874 outer-loop vectorization is safe. Likewise, when we are vectorizing
7875 a series of reductions using SLP and the VF is one, the reductions
7876 are performed in scalar order. */
7877 if (slp_node
7878 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7879 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7881 else if (needs_fold_left_reduction_p (op.type, orig_code))
7883 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7884 is not directly used in stmt. */
7885 if (!only_slp_reduc_chain
7886 && reduc_chain_length != 1)
7888 if (dump_enabled_p ())
7889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7890 "in-order reduction chain without SLP.\n");
7891 return false;
7893 STMT_VINFO_REDUC_TYPE (reduc_info)
7894 = reduction_type = FOLD_LEFT_REDUCTION;
7896 else if (!commutative_binary_op_p (orig_code, op.type)
7897 || !associative_binary_op_p (orig_code, op.type))
7899 if (dump_enabled_p ())
7900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7901 "reduction: not commutative/associative\n");
7902 return false;
7906 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7907 && ncopies > 1)
7909 if (dump_enabled_p ())
7910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7911 "multiple types in double reduction or condition "
7912 "reduction or fold-left reduction.\n");
7913 return false;
7916 internal_fn reduc_fn = IFN_LAST;
7917 if (reduction_type == TREE_CODE_REDUCTION
7918 || reduction_type == FOLD_LEFT_REDUCTION
7919 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7920 || reduction_type == CONST_COND_REDUCTION)
7922 if (reduction_type == FOLD_LEFT_REDUCTION
7923 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7924 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7926 if (reduc_fn != IFN_LAST
7927 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7928 OPTIMIZE_FOR_SPEED))
7930 if (dump_enabled_p ())
7931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7932 "reduc op not supported by target.\n");
7934 reduc_fn = IFN_LAST;
7937 else
7939 if (!nested_cycle || double_reduc)
7941 if (dump_enabled_p ())
7942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7943 "no reduc code for scalar code.\n");
7945 return false;
7949 else if (reduction_type == COND_REDUCTION)
7951 int scalar_precision
7952 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7953 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7954 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7955 vectype_out);
7957 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7958 OPTIMIZE_FOR_SPEED))
7959 reduc_fn = IFN_REDUC_MAX;
7961 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7963 if (reduction_type != EXTRACT_LAST_REDUCTION
7964 && (!nested_cycle || double_reduc)
7965 && reduc_fn == IFN_LAST
7966 && !nunits_out.is_constant ())
7968 if (dump_enabled_p ())
7969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7970 "missing target support for reduction on"
7971 " variable-length vectors.\n");
7972 return false;
7975 /* For SLP reductions, see if there is a neutral value we can use. */
7976 tree neutral_op = NULL_TREE;
7977 if (slp_node)
7979 tree initial_value = NULL_TREE;
7980 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7981 initial_value = vect_phi_initial_value (reduc_def_phi);
7982 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7983 orig_code, initial_value);
7986 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7988 /* We can't support in-order reductions of code such as this:
7990 for (int i = 0; i < n1; ++i)
7991 for (int j = 0; j < n2; ++j)
7992 l += a[j];
7994 since GCC effectively transforms the loop when vectorizing:
7996 for (int i = 0; i < n1 / VF; ++i)
7997 for (int j = 0; j < n2; ++j)
7998 for (int k = 0; k < VF; ++k)
7999 l += a[j];
8001 which is a reassociation of the original operation. */
8002 if (dump_enabled_p ())
8003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8004 "in-order double reduction not supported.\n");
8006 return false;
8009 if (reduction_type == FOLD_LEFT_REDUCTION
8010 && slp_node
8011 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8013 /* We cannot use in-order reductions in this case because there is
8014 an implicit reassociation of the operations involved. */
8015 if (dump_enabled_p ())
8016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8017 "in-order unchained SLP reductions not supported.\n");
8018 return false;
8021 /* For double reductions, and for SLP reductions with a neutral value,
8022 we construct a variable-length initial vector by loading a vector
8023 full of the neutral value and then shift-and-inserting the start
8024 values into the low-numbered elements. */
8025 if ((double_reduc || neutral_op)
8026 && !nunits_out.is_constant ()
8027 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8028 vectype_out, OPTIMIZE_FOR_SPEED))
8030 if (dump_enabled_p ())
8031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8032 "reduction on variable-length vectors requires"
8033 " target support for a vector-shift-and-insert"
8034 " operation.\n");
8035 return false;
8038 /* Check extra constraints for variable-length unchained SLP reductions. */
8039 if (slp_node
8040 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8041 && !nunits_out.is_constant ())
8043 /* We checked above that we could build the initial vector when
8044 there's a neutral element value. Check here for the case in
8045 which each SLP statement has its own initial value and in which
8046 that value needs to be repeated for every instance of the
8047 statement within the initial vector. */
8048 unsigned int group_size = SLP_TREE_LANES (slp_node);
8049 if (!neutral_op
8050 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8051 TREE_TYPE (vectype_out)))
8053 if (dump_enabled_p ())
8054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8055 "unsupported form of SLP reduction for"
8056 " variable-length vectors: cannot build"
8057 " initial vector.\n");
8058 return false;
8060 /* The epilogue code relies on the number of elements being a multiple
8061 of the group size. The duplicate-and-interleave approach to setting
8062 up the initial vector does too. */
8063 if (!multiple_p (nunits_out, group_size))
8065 if (dump_enabled_p ())
8066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8067 "unsupported form of SLP reduction for"
8068 " variable-length vectors: the vector size"
8069 " is not a multiple of the number of results.\n");
8070 return false;
8074 if (reduction_type == COND_REDUCTION)
8076 widest_int ni;
8078 if (! max_loop_iterations (loop, &ni))
8080 if (dump_enabled_p ())
8081 dump_printf_loc (MSG_NOTE, vect_location,
8082 "loop count not known, cannot create cond "
8083 "reduction.\n");
8084 return false;
8086 /* Convert backedges to iterations. */
8087 ni += 1;
8089 /* The additional index will be the same type as the condition. Check
8090 that the loop iteration count fits into this type less one (because
8091 we'll use up the zero slot for when there are no matches). */
8092 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8093 if (wi::geu_p (ni, wi::to_widest (max_index)))
8095 if (dump_enabled_p ())
8096 dump_printf_loc (MSG_NOTE, vect_location,
8097 "loop size is greater than data size.\n");
8098 return false;
8102 /* In case the vectorization factor (VF) is bigger than the number
8103 of elements that we can fit in a vectype (nunits), we have to generate
8104 more than one vector stmt - i.e - we need to "unroll" the
8105 vector stmt by a factor VF/nunits. For more details see documentation
8106 in vectorizable_operation. */
8108 /* If the reduction is used in an outer loop we need to generate
8109 VF intermediate results, like so (e.g. for ncopies=2):
8110 r0 = phi (init, r0)
8111 r1 = phi (init, r1)
8112 r0 = x0 + r0;
8113 r1 = x1 + r1;
8114 (i.e. we generate VF results in 2 registers).
8115 In this case we have a separate def-use cycle for each copy, and therefore
8116 for each copy we get the vector def for the reduction variable from the
8117 respective phi node created for this copy.
8119 Otherwise (the reduction is unused in the loop nest), we can combine
8120 together intermediate results, like so (e.g. for ncopies=2):
8121 r = phi (init, r)
8122 r = x0 + r;
8123 r = x1 + r;
8124 (i.e. we generate VF/2 results in a single register).
8125 In this case for each copy we get the vector def for the reduction variable
8126 from the vectorized reduction operation generated in the previous iteration.
8128 This only works when we see both the reduction PHI and its only consumer
8129 in vectorizable_reduction and there are no intermediate stmts
8130 participating. When unrolling we want each unrolled iteration to have its
8131 own reduction accumulator since one of the main goals of unrolling a
8132 reduction is to reduce the aggregate loop-carried latency. */
8133 if (ncopies > 1
8134 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8135 && reduc_chain_length == 1
8136 && loop_vinfo->suggested_unroll_factor == 1)
8137 single_defuse_cycle = true;
8139 if (single_defuse_cycle || lane_reduc_code_p)
8141 gcc_assert (op.code != COND_EXPR);
8143 /* 4. Supportable by target? */
8144 bool ok = true;
8146 /* 4.1. check support for the operation in the loop
8148 This isn't necessary for the lane reduction codes, since they
8149 can only be produced by pattern matching, and it's up to the
8150 pattern matcher to test for support. The main reason for
8151 specifically skipping this step is to avoid rechecking whether
8152 mixed-sign dot-products can be implemented using signed
8153 dot-products. */
8154 machine_mode vec_mode = TYPE_MODE (vectype_in);
8155 if (!lane_reduc_code_p
8156 && !directly_supported_p (op.code, vectype_in, optab_vector))
8158 if (dump_enabled_p ())
8159 dump_printf (MSG_NOTE, "op not supported by target.\n");
8160 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8161 || !vect_can_vectorize_without_simd_p (op.code))
8162 ok = false;
8163 else
8164 if (dump_enabled_p ())
8165 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8168 if (vect_emulated_vector_p (vectype_in)
8169 && !vect_can_vectorize_without_simd_p (op.code))
8171 if (dump_enabled_p ())
8172 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8173 return false;
8176 /* Lane-reducing operations have to go through vect_transform_reduction.
8177 For the other cases try without the single-cycle optimization. */
8178 if (!ok)
8180 if (lane_reduc_code_p)
8181 return false;
8182 else
8183 single_defuse_cycle = false;
8186 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8188 /* If the reduction stmt is one of the patterns that have lane
8189 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8190 if ((ncopies > 1 && ! single_defuse_cycle)
8191 && lane_reduc_code_p)
8193 if (dump_enabled_p ())
8194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8195 "multi def-use cycle not possible for lane-reducing "
8196 "reduction operation\n");
8197 return false;
8200 if (slp_node
8201 && !(!single_defuse_cycle
8202 && !lane_reduc_code_p
8203 && reduction_type != FOLD_LEFT_REDUCTION))
8204 for (i = 0; i < (int) op.num_ops; i++)
8205 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8207 if (dump_enabled_p ())
8208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8209 "incompatible vector types for invariants\n");
8210 return false;
8213 if (slp_node)
8214 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8215 else
8216 vec_num = 1;
8218 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8219 reduction_type, ncopies, cost_vec);
8220 /* Cost the reduction op inside the loop if transformed via
8221 vect_transform_reduction. Otherwise this is costed by the
8222 separate vectorizable_* routines. */
8223 if (single_defuse_cycle || lane_reduc_code_p)
8225 int factor = 1;
8226 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8227 /* Three dot-products and a subtraction. */
8228 factor = 4;
8229 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8230 stmt_info, 0, vect_body);
8233 if (dump_enabled_p ()
8234 && reduction_type == FOLD_LEFT_REDUCTION)
8235 dump_printf_loc (MSG_NOTE, vect_location,
8236 "using an in-order (fold-left) reduction.\n");
8237 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8238 /* All but single-defuse-cycle optimized, lane-reducing and fold-left
8239 reductions go through their own vectorizable_* routines. */
8240 if (!single_defuse_cycle
8241 && !lane_reduc_code_p
8242 && reduction_type != FOLD_LEFT_REDUCTION)
8244 stmt_vec_info tem
8245 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8246 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8248 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8249 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8251 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8252 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8254 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8256 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8257 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8258 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8260 if (reduction_type != FOLD_LEFT_REDUCTION
8261 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8262 && (cond_fn == IFN_LAST
8263 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8264 OPTIMIZE_FOR_SPEED)))
8266 if (dump_enabled_p ())
8267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8268 "can't operate on partial vectors because"
8269 " no conditional operation is available.\n");
8270 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8272 else if (reduction_type == FOLD_LEFT_REDUCTION
8273 && reduc_fn == IFN_LAST
8274 && !expand_vec_cond_expr_p (vectype_in,
8275 truth_type_for (vectype_in),
8276 SSA_NAME))
8278 if (dump_enabled_p ())
8279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8280 "can't operate on partial vectors because"
8281 " no conditional operation is available.\n");
8282 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8284 else if (reduction_type == FOLD_LEFT_REDUCTION
8285 && internal_fn_mask_index (reduc_fn) == -1
8286 && FLOAT_TYPE_P (vectype_in)
8287 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8289 if (dump_enabled_p ())
8290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8291 "can't operate on partial vectors because"
8292 " signed zeros cannot be preserved.\n");
8293 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8295 else
8297 internal_fn mask_reduc_fn
8298 = get_masked_reduction_fn (reduc_fn, vectype_in);
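/* MASK_LEN_ style reductions are length-controlled, so record a loop
   length for them; all other partially-vectorized reductions are
   controlled by a loop mask.  */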
8300 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8301 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8302 vectype_in, 1);
8303 else
8304 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8305 vectype_in, NULL);
8308 return true;
8311 /* STMT_INFO is a dot-product reduction whose multiplication operands
8312 have different signs. Emit a sequence to emulate the operation
8313 using a series of signed DOT_PROD_EXPRs and return the last
8314 statement generated. VEC_DEST is the result of the vector operation
8315 and VOP lists its inputs. */
8317 static gassign *
8318 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8319 gimple_stmt_iterator *gsi, tree vec_dest,
8320 tree vop[3])
8322 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8323 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8324 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8325 gimple *new_stmt;
8327 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8328 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8329 std::swap (vop[0], vop[1]);
8331 /* Convert all inputs to signed types. */
8332 for (int i = 0; i < 3; ++i)
8333 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8335 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8336 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8337 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8338 vop[i] = tmp;
8341 /* In the comments below we assume 8-bit inputs for simplicity,
8342 but the approach works for any full integer type. */
8344 /* Create a vector of -128. */
8345 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8346 tree min_narrow = build_vector_from_val (narrow_vectype,
8347 min_narrow_elttype);
8349 /* Create a vector of 64. */
8350 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8351 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8352 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8354 /* Emit: SUB_RES = VOP[0] - 128. */
8355 tree sub_res = make_ssa_name (narrow_vectype);
8356 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8357 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8359 /* Emit:
8361 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8362 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8363 STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8365 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
8366 Doing the two 64 * y steps first allows more time to compute x. */
8367 tree stage1 = make_ssa_name (wide_vectype);
8368 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8369 vop[1], half_narrow, vop[2]);
8370 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8372 tree stage2 = make_ssa_name (wide_vectype);
8373 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8374 vop[1], half_narrow, stage1);
8375 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8377 tree stage3 = make_ssa_name (wide_vectype);
8378 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8379 sub_res, vop[1], stage2);
8380 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8382 /* Convert STAGE3 to the reduction type. */
8383 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
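/* A worked instance of the emulation above, for illustration only
   (arbitrary 8-bit inputs, not taken from the code): let x = 200
   (unsigned) and y = -3 (signed), with a zero accumulator.

     SUB_RES = 200 - 128          =   72
     STAGE1  = 64 * -3 + 0        = -192
     STAGE2  = 64 * -3 + STAGE1   = -384
     STAGE3  = 72 * -3 + STAGE2   = -600

   which equals 200 * -3, so the unsigned-by-signed product has been
   accumulated using signed 8-bit multiplications only.  */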
8386 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8387 value. */
8389 bool
8390 vect_transform_reduction (loop_vec_info loop_vinfo,
8391 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8392 gimple **vec_stmt, slp_tree slp_node)
8394 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8395 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8396 int i;
8397 int ncopies;
8398 int vec_num;
8400 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8401 gcc_assert (reduc_info->is_reduc_info);
8403 if (nested_in_vect_loop_p (loop, stmt_info))
8405 loop = loop->inner;
8406 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8409 gimple_match_op op;
8410 if (!gimple_extract_op (stmt_info->stmt, &op))
8411 gcc_unreachable ();
8413 /* All uses but the last are expected to be defined in the loop.
8414 The last use is the reduction variable. In case of nested cycle this
8415 assumption is not true: we use reduc_index to record the index of the
8416 reduction variable. */
8417 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8418 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8419 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8420 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8422 if (slp_node)
8424 ncopies = 1;
8425 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8427 else
8429 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8430 vec_num = 1;
8433 code_helper code = canonicalize_code (op.code, op.type);
8434 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8436 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8437 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8438 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8440 /* Transform. */
8441 tree new_temp = NULL_TREE;
8442 auto_vec<tree> vec_oprnds0;
8443 auto_vec<tree> vec_oprnds1;
8444 auto_vec<tree> vec_oprnds2;
8445 tree def0;
8447 if (dump_enabled_p ())
8448 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8450 /* FORNOW: Multiple types are not supported for condition. */
8451 if (code == COND_EXPR)
8452 gcc_assert (ncopies == 1);
8454 /* A binary COND_OP reduction must have the same definition and else
8455 value. */
8456 bool cond_fn_p = code.is_internal_fn ()
8457 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8458 if (cond_fn_p)
8460 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8461 || code == IFN_COND_MUL || code == IFN_COND_AND
8462 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8463 gcc_assert (op.num_ops == 4
8464 && (op.ops[reduc_index]
8465 == op.ops[internal_fn_else_index ((internal_fn) code)]));
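/* E.g. a reduction via IFN_COND_ADD (MASK, A, B, ELSE) with accumulator
   A must use ELSE == A, so that lanes where MASK is false simply carry
   the accumulator through unchanged.  */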
8468 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8470 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8471 if (reduction_type == FOLD_LEFT_REDUCTION)
8473 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8474 gcc_assert (code.is_tree_code () || cond_fn_p);
8475 return vectorize_fold_left_reduction
8476 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8477 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8478 reduc_index, masks, lens);
8481 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8482 gcc_assert (single_defuse_cycle
8483 || code == DOT_PROD_EXPR
8484 || code == WIDEN_SUM_EXPR
8485 || code == SAD_EXPR);
8487 /* Create the destination vector */
8488 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8489 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8491 /* Get NCOPIES vector definitions for all operands except the reduction
8492 definition. */
8493 if (!cond_fn_p)
8495 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8496 single_defuse_cycle && reduc_index == 0
8497 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8498 single_defuse_cycle && reduc_index == 1
8499 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8500 op.num_ops == 3
8501 && !(single_defuse_cycle && reduc_index == 2)
8502 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8504 else
8506 /* For a conditional operation pass the truth type as mask
8507 vectype. */
8508 gcc_assert (single_defuse_cycle
8509 && (reduc_index == 1 || reduc_index == 2));
8510 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8511 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8512 reduc_index == 1 ? NULL_TREE : op.ops[1],
8513 NULL_TREE, &vec_oprnds1,
8514 reduc_index == 2 ? NULL_TREE : op.ops[2],
8515 NULL_TREE, &vec_oprnds2);
8518 /* For single def-use cycles get one copy of the vectorized reduction
8519 definition. */
8520 if (single_defuse_cycle)
8522 gcc_assert (!slp_node);
8523 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8524 op.ops[reduc_index],
8525 reduc_index == 0 ? &vec_oprnds0
8526 : (reduc_index == 1 ? &vec_oprnds1
8527 : &vec_oprnds2));
8530 bool emulated_mixed_dot_prod
8531 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8532 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8534 gimple *new_stmt;
8535 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8536 if (masked_loop_p && !mask_by_cond_expr)
8538 /* No conditional ifns have been defined for dot-product yet. */
8539 gcc_assert (code != DOT_PROD_EXPR);
8541 /* Make sure that the reduction accumulator is vop[0]. */
8542 if (reduc_index == 1)
8544 gcc_assert (commutative_binary_op_p (code, op.type));
8545 std::swap (vop[0], vop[1]);
8547 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8548 vec_num * ncopies, vectype_in, i);
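/* Emit COND_FN (MASK, VOP[0], VOP[1], VOP[0]): active lanes compute
   VOP[0] OP VOP[1], inactive lanes pass the accumulator VOP[0]
   through unchanged.  */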
8549 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8550 vop[0], vop[1], vop[0]);
8551 new_temp = make_ssa_name (vec_dest, call);
8552 gimple_call_set_lhs (call, new_temp);
8553 gimple_call_set_nothrow (call, true);
8554 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8555 new_stmt = call;
8557 else
8559 if (op.num_ops >= 3)
8560 vop[2] = vec_oprnds2[i];
8562 if (masked_loop_p && mask_by_cond_expr)
8564 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8565 vec_num * ncopies, vectype_in, i);
8566 build_vect_cond_expr (code, vop, mask, gsi);
8569 if (emulated_mixed_dot_prod)
8570 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8571 vec_dest, vop);
8573 else if (code.is_internal_fn () && !cond_fn_p)
8574 new_stmt = gimple_build_call_internal (internal_fn (code),
8575 op.num_ops,
8576 vop[0], vop[1], vop[2]);
8577 else if (code.is_internal_fn () && cond_fn_p)
8578 new_stmt = gimple_build_call_internal (internal_fn (code),
8579 op.num_ops,
8580 vop[0], vop[1], vop[2],
8581 vop[1]);
8582 else
8583 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8584 vop[0], vop[1], vop[2]);
8585 new_temp = make_ssa_name (vec_dest, new_stmt);
8586 gimple_set_lhs (new_stmt, new_temp);
8587 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8590 if (slp_node)
8591 slp_node->push_vec_def (new_stmt);
8592 else if (single_defuse_cycle
8593 && i < ncopies - 1)
8595 if (reduc_index == 0)
8596 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8597 else if (reduc_index == 1)
8598 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8599 else if (reduc_index == 2)
8600 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8602 else
8603 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8606 if (!slp_node)
8607 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8609 return true;
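/* For a non-SLP single def-use cycle with ncopies > 1 the loop above
   chains the copies through the accumulator.  An illustrative sketch
   for ncopies == 2 and a PLUS reduction (the acc* names are made up
   for the example):

     acc1 = vec_oprnds0[0] + acc0;   <- copy 0, acc0 comes from the PHI
     acc2 = vec_oprnds0[1] + acc1;   <- copy 1, fed back via safe_push

   i.e. each copy's result is pushed back as the reduction operand of
   the next copy and the last one becomes the value carried around the
   loop.  */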
8612 /* Transform phase of a cycle PHI. */
8614 bool
8615 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8616 stmt_vec_info stmt_info, gimple **vec_stmt,
8617 slp_tree slp_node, slp_instance slp_node_instance)
8619 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8620 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8621 int i;
8622 int ncopies;
8623 int j;
8624 bool nested_cycle = false;
8625 int vec_num;
8627 if (nested_in_vect_loop_p (loop, stmt_info))
8629 loop = loop->inner;
8630 nested_cycle = true;
8633 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8634 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8635 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8636 gcc_assert (reduc_info->is_reduc_info);
8638 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8639 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8640 /* Leave the scalar phi in place. */
8641 return true;
8643 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8644 /* For a nested cycle we do not fill the above. */
8645 if (!vectype_in)
8646 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8647 gcc_assert (vectype_in);
8649 if (slp_node)
8651 /* The size vect_schedule_slp_instance computes is off for us. */
8652 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8653 * SLP_TREE_LANES (slp_node), vectype_in);
8654 ncopies = 1;
8656 else
8658 vec_num = 1;
8659 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8662 /* Check whether we should use a single PHI node and accumulate
8663 vectors to one before the backedge. */
8664 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8665 ncopies = 1;
8667 /* Create the destination vector */
8668 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8669 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8670 vectype_out);
8672 /* Get the loop-entry arguments. */
8673 tree vec_initial_def = NULL_TREE;
8674 auto_vec<tree> vec_initial_defs;
8675 if (slp_node)
8677 vec_initial_defs.reserve (vec_num);
8678 if (nested_cycle)
8680 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8681 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8682 &vec_initial_defs);
8684 else
8686 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8687 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8688 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8690 unsigned int num_phis = stmts.length ();
8691 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8692 num_phis = 1;
8693 initial_values.reserve (num_phis);
8694 for (unsigned int i = 0; i < num_phis; ++i)
8696 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8697 initial_values.quick_push (vect_phi_initial_value (this_phi));
8699 if (vec_num == 1)
8700 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8701 if (!initial_values.is_empty ())
8703 tree initial_value
8704 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8705 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8706 tree neutral_op
8707 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8708 code, initial_value);
8709 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8710 &vec_initial_defs, vec_num,
8711 stmts.length (), neutral_op);
8715 else
8717 /* Get at the scalar def before the loop, that defines the initial
8718 value of the reduction variable. */
8719 tree initial_def = vect_phi_initial_value (phi);
8720 reduc_info->reduc_initial_values.safe_push (initial_def);
8721 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8722 and we can't use zero for induc_val, use initial_def. Similarly
8723 for REDUC_MIN and initial_def larger than the base. */
8724 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8726 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8727 if (TREE_CODE (initial_def) == INTEGER_CST
8728 && !integer_zerop (induc_val)
8729 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8730 && tree_int_cst_lt (initial_def, induc_val))
8731 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8732 && tree_int_cst_lt (induc_val, initial_def))))
8734 induc_val = initial_def;
8735 /* Communicate we used the initial_def to epilogue
8736 generation. */
8737 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8739 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8741 else if (nested_cycle)
8743 /* Do not use an adjustment def as that case is not supported
8744 correctly if ncopies is not one. */
8745 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8746 ncopies, initial_def,
8747 &vec_initial_defs);
8749 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8750 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8751 /* Fill the initial vector with the initial scalar value. */
8752 vec_initial_def
8753 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8754 initial_def, initial_def);
8755 else
8757 if (ncopies == 1)
8758 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8759 if (!reduc_info->reduc_initial_values.is_empty ())
8761 initial_def = reduc_info->reduc_initial_values[0];
8762 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8763 tree neutral_op
8764 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8765 code, initial_def);
8766 gcc_assert (neutral_op);
8767 /* Try to simplify the vector initialization by applying an
8768 adjustment after the reduction has been performed; e.g. for a PLUS reduction with initial value 5, reduce from the neutral value 0 and add 5 back in the epilogue. */
8769 if (!reduc_info->reused_accumulator
8770 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8771 && !operand_equal_p (neutral_op, initial_def))
8773 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8774 = initial_def;
8775 initial_def = neutral_op;
8777 vec_initial_def
8778 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8779 initial_def, neutral_op);
8784 if (vec_initial_def)
8786 vec_initial_defs.create (ncopies);
8787 for (i = 0; i < ncopies; ++i)
8788 vec_initial_defs.quick_push (vec_initial_def);
8791 if (auto *accumulator = reduc_info->reused_accumulator)
8793 tree def = accumulator->reduc_input;
8794 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8796 unsigned int nreduc;
8797 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8798 (TREE_TYPE (def)),
8799 TYPE_VECTOR_SUBPARTS (vectype_out),
8800 &nreduc);
8801 gcc_assert (res);
8802 gimple_seq stmts = NULL;
8803 /* Reduce the single vector to a smaller one. */
8804 if (nreduc != 1)
8806 /* Perform the reduction in the appropriate type. */
8807 tree rvectype = vectype_out;
8808 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8809 TREE_TYPE (TREE_TYPE (def))))
8810 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8811 TYPE_VECTOR_SUBPARTS
8812 (vectype_out));
8813 def = vect_create_partial_epilog (def, rvectype,
8814 STMT_VINFO_REDUC_CODE
8815 (reduc_info),
8816 &stmts);
8818 /* The epilogue loop might use a different vector mode, like
8819 VNx2DI vs. V2DI. */
8820 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8822 tree reduc_type = build_vector_type_for_mode
8823 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8824 def = gimple_convert (&stmts, reduc_type, def);
8826 /* Adjust the input so we pick up the partially reduced value
8827 for the skip edge in vect_create_epilog_for_reduction. */
8828 accumulator->reduc_input = def;
8829 /* And the reduction could be carried out using a different sign. */
8830 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8831 def = gimple_convert (&stmts, vectype_out, def);
8832 if (loop_vinfo->main_loop_edge)
8834 /* While we'd like to insert on the edge, this would split
8835 blocks and disturb bookkeeping; we will also eventually
8836 need this on the skip edge. Rely on sinking to
8837 fix up optimal placement and insert in the predecessor. */
8838 gimple_stmt_iterator gsi
8839 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8840 /* Insert before a cond that eventually skips the
8841 epilogue. */
8842 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8843 gsi_prev (&gsi);
8844 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8846 else
8847 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8848 stmts);
8850 if (loop_vinfo->main_loop_edge)
8851 vec_initial_defs[0]
8852 = vect_get_main_loop_result (loop_vinfo, def,
8853 vec_initial_defs[0]);
8854 else
8855 vec_initial_defs.safe_push (def);
8858 /* Generate the reduction PHIs upfront. */
8859 for (i = 0; i < vec_num; i++)
8861 tree vec_init_def = vec_initial_defs[i];
8862 for (j = 0; j < ncopies; j++)
8864 /* Create the reduction-phi that defines the reduction
8865 operand. */
8866 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8868 /* Set the loop-entry arg of the reduction-phi. */
8869 if (j != 0 && nested_cycle)
8870 vec_init_def = vec_initial_defs[j];
8871 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8872 UNKNOWN_LOCATION);
8874 /* The loop-latch arg is set in epilogue processing. */
8876 if (slp_node)
8877 slp_node->push_vec_def (new_phi);
8878 else
8880 if (j == 0)
8881 *vec_stmt = new_phi;
8882 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8887 return true;
8890 /* Vectorizes LC PHIs. */
8892 bool
8893 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8894 stmt_vec_info stmt_info, gimple **vec_stmt,
8895 slp_tree slp_node)
8897 if (!loop_vinfo
8898 || !is_a <gphi *> (stmt_info->stmt)
8899 || gimple_phi_num_args (stmt_info->stmt) != 1)
8900 return false;
8902 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8903 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8904 return false;
8906 if (!vec_stmt) /* transformation not required. */
8908 /* Deal with copies from externs or constants that are disguised as
8909 loop-closed PHI nodes (PR97886). */
8910 if (slp_node
8911 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8912 SLP_TREE_VECTYPE (slp_node)))
8914 if (dump_enabled_p ())
8915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8916 "incompatible vector types for invariants\n");
8917 return false;
8919 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8920 return true;
8923 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8924 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8925 basic_block bb = gimple_bb (stmt_info->stmt);
8926 edge e = single_pred_edge (bb);
8927 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8928 auto_vec<tree> vec_oprnds;
8929 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8930 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8931 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8932 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8934 /* Create the vectorized LC PHI node. */
8935 gphi *new_phi = create_phi_node (vec_dest, bb);
8936 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8937 if (slp_node)
8938 slp_node->push_vec_def (new_phi);
8939 else
8940 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8942 if (!slp_node)
8943 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8945 return true;
8948 /* Vectorizes PHIs. */
8950 bool
8951 vectorizable_phi (vec_info *,
8952 stmt_vec_info stmt_info, gimple **vec_stmt,
8953 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8955 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8956 return false;
8958 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8959 return false;
8961 tree vectype = SLP_TREE_VECTYPE (slp_node);
8963 if (!vec_stmt) /* transformation not required. */
8965 slp_tree child;
8966 unsigned i;
8967 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8968 if (!child)
8970 if (dump_enabled_p ())
8971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8972 "PHI node with unvectorized backedge def\n");
8973 return false;
8975 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8977 if (dump_enabled_p ())
8978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8979 "incompatible vector types for invariants\n");
8980 return false;
8982 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8983 && !useless_type_conversion_p (vectype,
8984 SLP_TREE_VECTYPE (child)))
8986 /* With bools we can have mask and non-mask precision vectors
8987 or different non-mask precisions. While pattern recog is
8988 supposed to guarantee consistency here, bugs in it can cause
8989 mismatches (PR103489 and PR103800 for example).
8990 Deal with them here instead of ICEing later. */
8991 if (dump_enabled_p ())
8992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8993 "incompatible vector type setup from "
8994 "bool pattern detection\n");
8995 return false;
8998 /* For single-argument PHIs assume coalescing which means zero cost
8999 for the scalar and the vector PHIs. This avoids artificially
9000 favoring the vector path (but may pessimize it in some cases). */
9001 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9002 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9003 vector_stmt, stmt_info, vectype, 0, vect_body);
9004 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9005 return true;
9008 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9009 basic_block bb = gimple_bb (stmt_info->stmt);
9010 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9011 auto_vec<gphi *> new_phis;
9012 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9014 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9016 /* Skip not yet vectorized defs. */
9017 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9018 && SLP_TREE_VEC_DEFS (child).is_empty ())
9019 continue;
9021 auto_vec<tree> vec_oprnds;
9022 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9023 if (!new_phis.exists ())
9025 new_phis.create (vec_oprnds.length ());
9026 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9029 /* Create the vectorized PHI node. */
9029 new_phis.quick_push (create_phi_node (vec_dest, bb));
9030 slp_node->push_vec_def (new_phis[j]);
9033 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9034 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9035 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9037 /* We should have at least one already vectorized child. */
9038 gcc_assert (new_phis.exists ());
9040 return true;
9043 /* Vectorizes first order recurrences. An overview of the transformation
9044 is described below. Suppose we have the following loop.
9046 int t = 0;
9047 for (int i = 0; i < n; ++i)
9049 b[i] = a[i] - t;
9050 t = a[i];
9053 There is a first-order recurrence on 't' (it carries the previous iteration's a[i]). For this loop, the scalar IR
9054 looks (simplified) like:
9056 scalar.preheader:
9057 init = 0;
9059 scalar.body:
9060 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9061 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9062 _1 = a[i]
9063 b[i] = _1 - _2
9064 if (i < n) goto scalar.body
9066 In this example, _2 is a recurrence because its value depends on the
9067 previous iteration. We vectorize this as (VF = 4)
9069 vector.preheader:
9070 vect_init = vect_cst(..., ..., ..., 0)
9072 vector.body
9073 i = PHI <0(vector.preheader), i+4(vector.body)>
9074 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9075 vect_2 = a[i, i+1, i+2, i+3];
9076 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9077 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9078 if (..) goto vector.body
9080 In this function, vectorizable_recurr, we code generate both the
9081 vector PHI node and the permute since those together compute the
9082 vectorized value of the scalar PHI. We do not yet have the
9083 backedge value to fill in there nor into the vec_perm. Those
9084 are filled in maybe_set_vectorized_backedge_value and
9085 vect_schedule_scc.
9087 TODO: Since the scalar loop does not have a use of the recurrence
9088 outside of the loop, the natural way to implement peeling via
9089 vectorizing the live value doesn't work. For now peeling of loops
9090 with a recurrence is not implemented. For SLP the supported cases
9091 are restricted to those requiring a single vector recurrence PHI. */
9093 bool
9094 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9095 gimple **vec_stmt, slp_tree slp_node,
9096 stmt_vector_for_cost *cost_vec)
9098 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9099 return false;
9101 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9103 /* So far we only support first-order recurrence auto-vectorization. */
9104 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9105 return false;
9107 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9108 unsigned ncopies;
9109 if (slp_node)
9110 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9111 else
9112 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9113 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9114 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9115 /* We need to be able to make progress with a single vector. */
9116 if (maybe_gt (dist * 2, nunits))
9118 if (dump_enabled_p ())
9119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9120 "first order recurrence exceeds half of "
9121 "a vector\n");
9122 return false;
9125 /* First-order recurrence autovectorization needs to handle permutation
9126 with indices = [nunits-1, nunits, nunits+1, ...]. */
9127 vec_perm_builder sel (nunits, 1, 3);
9128 for (int i = 0; i < 3; ++i)
9129 sel.quick_push (nunits - dist + i);
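/* E.g. for nunits == 4 and dist == 1 this encodes the { 3, 4, 5, 6 }
   permutation shown in the example above.  */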
9130 vec_perm_indices indices (sel, 2, nunits);
9132 if (!vec_stmt) /* transformation not required. */
9134 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9135 indices))
9136 return false;
9138 if (slp_node)
9140 /* We eventually need to set a vector type on invariant
9141 arguments. */
9142 unsigned j;
9143 slp_tree child;
9144 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9145 if (!vect_maybe_update_slp_op_vectype
9146 (child, SLP_TREE_VECTYPE (slp_node)))
9148 if (dump_enabled_p ())
9149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9150 "incompatible vector types for "
9151 "invariants\n");
9152 return false;
9155 /* The recurrence costs the initialization vector and one permute
9156 for each copy. */
9157 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9158 stmt_info, 0, vect_prologue);
9159 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9160 stmt_info, 0, vect_body);
9161 if (dump_enabled_p ())
9162 dump_printf_loc (MSG_NOTE, vect_location,
9163 "vectorizable_recurr: inside_cost = %d, "
9164 "prologue_cost = %d .\n", inside_cost,
9165 prologue_cost);
9167 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9168 return true;
9171 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9172 basic_block bb = gimple_bb (phi);
9173 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9174 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9176 gimple_seq stmts = NULL;
9177 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9178 gsi_insert_seq_on_edge_immediate (pe, stmts);
9180 tree vec_init = build_vector_from_val (vectype, preheader);
9181 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9183 /* Create the vectorized first-order PHI node. */
9184 tree vec_dest = vect_get_new_vect_var (vectype,
9185 vect_simple_var, "vec_recur_");
9186 gphi *new_phi = create_phi_node (vec_dest, bb);
9187 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9189 /* Insert the shuffles for the first-order recurrence autovectorization:
9190 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9191 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9193 /* Insert the required permute after the latch definition. The
9194 second and later operands are tentative and will be updated when we have
9195 vectorized the latch definition. */
9196 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9197 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9198 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9199 gsi_next (&gsi2);
9201 for (unsigned i = 0; i < ncopies; ++i)
9203 vec_dest = make_ssa_name (vectype);
9204 gassign *vperm
9205 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9206 i == 0 ? gimple_phi_result (new_phi) : NULL,
9207 NULL, perm);
9208 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9210 if (slp_node)
9211 slp_node->push_vec_def (vperm);
9212 else
9213 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9216 if (!slp_node)
9217 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9218 return true;
9221 /* Return true if VECTYPE represents a vector that requires lowering
9222 by the vector lowering pass. */
9224 bool
9225 vect_emulated_vector_p (tree vectype)
9227 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9228 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9229 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9232 /* Return true if we can emulate CODE on an integer mode representation
9233 of a vector. */
9235 bool
9236 vect_can_vectorize_without_simd_p (tree_code code)
9238 switch (code)
9240 case PLUS_EXPR:
9241 case MINUS_EXPR:
9242 case NEGATE_EXPR:
9243 case BIT_AND_EXPR:
9244 case BIT_IOR_EXPR:
9245 case BIT_XOR_EXPR:
9246 case BIT_NOT_EXPR:
9247 return true;
9249 default:
9250 return false;
9254 /* Likewise, but taking a code_helper. */
9256 bool
9257 vect_can_vectorize_without_simd_p (code_helper code)
9259 return (code.is_tree_code ()
9260 && vect_can_vectorize_without_simd_p (tree_code (code)));
9263 /* Create vector init for vectorized iv. */
9264 static tree
9265 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9266 tree step_expr, poly_uint64 nunits,
9267 tree vectype,
9268 enum vect_induction_op_type induction_type)
9270 unsigned HOST_WIDE_INT const_nunits;
9271 tree vec_shift, vec_init, new_name;
9272 unsigned i;
9273 tree itype = TREE_TYPE (vectype);
9275 /* iv_loop is the loop to be vectorized. Create the vector of initial
9276 values, e.g. [X, X op S, X op 2*S, X op 3*S] where op depends on induction_type (S = step_expr, X = init_expr). */
9277 new_name = gimple_convert (stmts, itype, init_expr);
9278 switch (induction_type)
9280 case vect_step_op_shr:
9281 case vect_step_op_shl:
9282 /* Build the initial vector and shift it by the series [0, S, 2*S, ...]. */
9283 vec_init = gimple_build_vector_from_val (stmts,
9284 vectype,
9285 new_name);
9286 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9287 build_zero_cst (itype), step_expr);
9288 vec_init = gimple_build (stmts,
9289 (induction_type == vect_step_op_shr
9290 ? RSHIFT_EXPR : LSHIFT_EXPR),
9291 vectype, vec_init, vec_shift);
9292 break;
9294 case vect_step_op_neg:
9296 vec_init = gimple_build_vector_from_val (stmts,
9297 vectype,
9298 new_name);
9299 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9300 vectype, vec_init);
9301 /* The encoding has 2 interleaved stepped patterns. */
9302 vec_perm_builder sel (nunits, 2, 3);
9303 sel.quick_grow (6);
9304 for (i = 0; i < 3; i++)
9306 sel[2 * i] = i;
9307 sel[2 * i + 1] = i + nunits;
9309 vec_perm_indices indices (sel, 2, nunits);
9310 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9311 fail when vec_init is const vector. In that situation vec_perm is not
9312 really needed. */
9313 tree perm_mask_even
9314 = vect_gen_perm_mask_any (vectype, indices);
9315 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9316 vectype,
9317 vec_init, vec_neg,
9318 perm_mask_even);
9320 break;
9322 case vect_step_op_mul:
9324 /* Use an unsigned multiplication to avoid undefined behavior from signed integer overflow. */
9325 gcc_assert (nunits.is_constant (&const_nunits));
9326 tree utype = unsigned_type_for (itype);
9327 tree uvectype = build_vector_type (utype,
9328 TYPE_VECTOR_SUBPARTS (vectype));
9329 new_name = gimple_convert (stmts, utype, new_name);
9330 vec_init = gimple_build_vector_from_val (stmts,
9331 uvectype,
9332 new_name);
9333 tree_vector_builder elts (uvectype, const_nunits, 1);
9334 tree elt_step = build_one_cst (utype);
9336 elts.quick_push (elt_step);
9337 for (i = 1; i < const_nunits; i++)
9339 /* Create: elt_step_i = elt_step_{i-1} * step_expr, i.e. step_expr**i. */
9340 elt_step = gimple_build (stmts, MULT_EXPR,
9341 utype, elt_step, step_expr);
9342 elts.quick_push (elt_step);
9344 /* Create a vector from [new_name_0, new_name_1, ...,
9345 new_name_nunits-1]. */
9346 tree vec_mul = gimple_build_vector (stmts, &elts);
9347 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9348 vec_init, vec_mul);
9349 vec_init = gimple_convert (stmts, vectype, vec_init);
9351 break;
9353 default:
9354 gcc_unreachable ();
9357 return vec_init;
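/* For illustration, with a four-lane vector, X = init_expr and
   S = step_expr, the initial vectors built above are:

     shr:  [ X, X >> S, X >> 2*S, X >> 3*S ]
     shl:  [ X, X << S, X << 2*S, X << 3*S ]
     neg:  [ X, -X, X, -X ]
     mul:  [ X, X*S, X*S*S, X*S*S*S ]   (computed in the unsigned type)  */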
9360 /* Peel init_expr by skip_niter for induction_type. */
9361 tree
9362 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9363 tree skip_niters, tree step_expr,
9364 enum vect_induction_op_type induction_type)
9366 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9367 tree type = TREE_TYPE (init_expr);
9368 unsigned prec = TYPE_PRECISION (type);
9369 switch (induction_type)
9371 case vect_step_op_neg:
9372 if (TREE_INT_CST_LOW (skip_niters) % 2)
9373 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9374 /* else no change. */
9375 break;
9377 case vect_step_op_shr:
9378 case vect_step_op_shl:
9379 skip_niters = gimple_convert (stmts, type, skip_niters);
9380 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9381 When the shift amount >= precision, we need to avoid undefined behavior.
9382 In the original loop there is no undefined behavior, and according to the semantics
9383 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9384 if (!tree_fits_uhwi_p (step_expr)
9385 || tree_to_uhwi (step_expr) >= prec)
9387 if (induction_type == vect_step_op_shl
9388 || TYPE_UNSIGNED (type))
9389 init_expr = build_zero_cst (type);
9390 else
9391 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9392 init_expr,
9393 wide_int_to_tree (type, prec - 1));
9395 else
9396 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9397 ? RSHIFT_EXPR : LSHIFT_EXPR),
9398 type, init_expr, step_expr);
9399 break;
9401 case vect_step_op_mul:
9403 tree utype = unsigned_type_for (type);
9404 init_expr = gimple_convert (stmts, utype, init_expr);
9405 wide_int skipn = wi::to_wide (skip_niters);
9406 wide_int begin = wi::to_wide (step_expr);
9407 auto_mpz base, exp, mod, res;
9408 wi::to_mpz (begin, base, TYPE_SIGN (type));
9409 wi::to_mpz (skipn, exp, UNSIGNED);
9410 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9411 mpz_powm (res, base, exp, mod);
9412 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9413 tree mult_expr = wide_int_to_tree (utype, begin);
9414 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9415 init_expr, mult_expr);
9416 init_expr = gimple_convert (stmts, type, init_expr);
9418 break;
9420 default:
9421 gcc_unreachable ();
9424 return init_expr;
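/* A worked instance of the peeling above, for illustration only
   (arbitrary values): skipping skip_niters == 3 iterations of a mul IV
   with init X and step S turns the init into X * (S*S*S mod 2^prec),
   computed with mpz_powm; an odd skip count of a neg IV just negates X;
   and a shift IV is shifted by 3*S, or forced to 0 (lshr/shl) resp.
   X >> (prec - 1) (ashr) when 3*S >= prec.  */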
9427 /* Create vector step for vectorized iv. */
9428 static tree
9429 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9430 poly_uint64 vf,
9431 enum vect_induction_op_type induction_type)
9433 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9434 tree new_name = NULL;
9435 /* Step should be pow (step, vf) for mult induction. */
9436 if (induction_type == vect_step_op_mul)
9438 gcc_assert (vf.is_constant ());
9439 wide_int begin = wi::to_wide (step_expr);
9441 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9442 begin = wi::mul (begin, wi::to_wide (step_expr));
9444 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9446 else if (induction_type == vect_step_op_neg)
9447 /* Do nothing. */
9449 else
9450 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9451 expr, step_expr);
9452 return new_name;
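/* For illustration, with VF == 4 the per-vector-iteration step computed
   here is S*S*S*S for a mul IV, 4*S for a shift IV, and nothing for a
   neg IV (each lane is negated an even number of times per vector
   iteration, assuming an even number of lanes, so the vector is reused
   as-is).  */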
9455 static tree
9456 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9457 stmt_vec_info stmt_info,
9458 tree new_name, tree vectype,
9459 enum vect_induction_op_type induction_type)
9461 /* No step is needed for neg induction. */
9462 if (induction_type == vect_step_op_neg)
9463 return NULL;
9465 tree t = unshare_expr (new_name);
9466 gcc_assert (CONSTANT_CLASS_P (new_name)
9467 || TREE_CODE (new_name) == SSA_NAME);
9468 tree new_vec = build_vector_from_val (vectype, t);
9469 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9470 new_vec, vectype, NULL);
9471 return vec_step;
9474 /* Update the vectorized iv with vec_step; induc_def is the current iv value (the PHI result). */
9475 static tree
9476 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9477 tree induc_def, tree vec_step,
9478 enum vect_induction_op_type induction_type)
9480 tree vec_def = induc_def;
9481 switch (induction_type)
9483 case vect_step_op_mul:
9485 /* Use an unsigned multiplication to avoid undefined behavior from signed integer overflow. */
9486 tree uvectype
9487 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9488 TYPE_VECTOR_SUBPARTS (vectype));
9489 vec_def = gimple_convert (stmts, uvectype, vec_def);
9490 vec_step = gimple_convert (stmts, uvectype, vec_step);
9491 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9492 vec_def, vec_step);
9493 vec_def = gimple_convert (stmts, vectype, vec_def);
9495 break;
9497 case vect_step_op_shr:
9498 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9499 vec_def, vec_step);
9500 break;
9502 case vect_step_op_shl:
9503 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9504 vec_def, vec_step);
9505 break;
9506 case vect_step_op_neg:
9507 vec_def = induc_def;
9508 /* Do nothing. */
9509 break;
9510 default:
9511 gcc_unreachable ();
9514 return vec_def;
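/* Putting the three helpers together for a right-shift IV X >>= S with
   VF == 4 (an illustrative trace, not taken from the code): the
   preheader builds [ X, X >> S, X >> 2*S, X >> 3*S ] and a vector step
   of 4*S, and every vector iteration then shifts the whole vector right
   by 4*S, so lane j holds X >> (4*i + j)*S in iteration i, matching the
   scalar IV.  */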
9518 /* Function vectorizable_nonlinear_induction
9520 Check if STMT_INFO performs a nonlinear induction computation that can be
9521 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9522 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9523 basic block.
9524 Return true if STMT_INFO is vectorizable in this way. */
9526 static bool
9527 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9528 stmt_vec_info stmt_info,
9529 gimple **vec_stmt, slp_tree slp_node,
9530 stmt_vector_for_cost *cost_vec)
9532 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9533 unsigned ncopies;
9534 bool nested_in_vect_loop = false;
9535 class loop *iv_loop;
9536 tree vec_def;
9537 edge pe = loop_preheader_edge (loop);
9538 basic_block new_bb;
9539 tree vec_init, vec_step;
9540 tree new_name;
9541 gimple *new_stmt;
9542 gphi *induction_phi;
9543 tree induc_def, vec_dest;
9544 tree init_expr, step_expr;
9545 tree niters_skip;
9546 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9547 unsigned i;
9548 gimple_stmt_iterator si;
9550 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9552 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9553 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9554 enum vect_induction_op_type induction_type
9555 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9557 gcc_assert (induction_type > vect_step_op_add);
9559 if (slp_node)
9560 ncopies = 1;
9561 else
9562 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9563 gcc_assert (ncopies >= 1);
9565 /* FORNOW. Only handle nonlinear induction in the same loop. */
9566 if (nested_in_vect_loop_p (loop, stmt_info))
9568 if (dump_enabled_p ())
9569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9570 "nonlinear induction in nested loop.\n");
9571 return false;
9574 iv_loop = loop;
9575 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9577 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9578 update for each iv and a permutation to generate wanted vector iv. */
9579 if (slp_node)
9581 if (dump_enabled_p ())
9582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9583 "SLP induction not supported for nonlinear"
9584 " induction.\n");
9585 return false;
9588 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9590 if (dump_enabled_p ())
9591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9592 "floating point nonlinear induction vectorization"
9593 " not supported.\n");
9594 return false;
9597 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9598 init_expr = vect_phi_initial_value (phi);
9599 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9600 && TREE_CODE (step_expr) == INTEGER_CST);
9601 /* step_expr should be aligned with init_expr,
9602 i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9603 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9605 if (TREE_CODE (init_expr) == INTEGER_CST)
9606 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9607 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9610 /* INIT_EXPR could be a bit_field, bail out in that case. */
9610 if (dump_enabled_p ())
9611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9612 "nonlinear induction vectorization failed:"
9613 " component type of vectype is not a nop conversion"
9614 " from type of init_expr.\n");
9615 return false;
9618 switch (induction_type)
9620 case vect_step_op_neg:
9621 if (TREE_CODE (init_expr) != INTEGER_CST
9622 && TREE_CODE (init_expr) != REAL_CST)
9624 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9625 if (!directly_supported_p (NEGATE_EXPR, vectype))
9626 return false;
9628 /* The encoding has 2 interleaved stepped patterns. */
9629 vec_perm_builder sel (nunits, 2, 3);
9630 machine_mode mode = TYPE_MODE (vectype);
9631 sel.quick_grow (6);
9632 for (i = 0; i < 3; i++)
9634 sel[i * 2] = i;
9635 sel[i * 2 + 1] = i + nunits;
9637 vec_perm_indices indices (sel, 2, nunits);
9638 if (!can_vec_perm_const_p (mode, mode, indices))
9639 return false;
9641 break;
9643 case vect_step_op_mul:
9645 /* Check for backend support of MULT_EXPR. */
9646 if (!directly_supported_p (MULT_EXPR, vectype))
9647 return false;
9649 /* ??? How to construct the vector step for variable-length vectors:
9650 [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
9651 if (!vf.is_constant ())
9652 return false;
9654 break;
9656 case vect_step_op_shr:
9657 /* Check for backend support of RSHIFT_EXPR. */
9658 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9659 return false;
9661 /* Don't shift more than type precision to avoid UD. */
9662 if (!tree_fits_uhwi_p (step_expr)
9663 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9664 TYPE_PRECISION (TREE_TYPE (init_expr))))
9665 return false;
9666 break;
9668 case vect_step_op_shl:
9669 /* Check for backend support of LSHIFT_EXPR. */
9670 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9671 return false;
9673 /* Don't shift more than type precision to avoid UD. */
9674 if (!tree_fits_uhwi_p (step_expr)
9675 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9676 TYPE_PRECISION (TREE_TYPE (init_expr))))
9677 return false;
9679 break;
9681 default:
9682 gcc_unreachable ();
9685 if (!vec_stmt) /* transformation not required. */
9687 unsigned inside_cost = 0, prologue_cost = 0;
9688 /* loop cost for vec_loop. */
9690 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9691 stmt_info, 0, vect_body);
9693 /* Neg induction doesn't have any inside_cost. */
9695 if (induction_type == vect_step_op_neg)
9696 inside_cost = 0;
9698 /* prologue cost for vec_init and vec_step. */
9699 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9700 stmt_info, 0, vect_prologue);
9702 if (dump_enabled_p ())
9703 dump_printf_loc (MSG_NOTE, vect_location,
9704 "vect_model_induction_cost: inside_cost = %d, "
9705 "prologue_cost = %d. \n", inside_cost,
9706 prologue_cost);
9708 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9709 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9710 return true;
9713 /* Transform. */
9715 /* Compute a vector variable, initialized with the first VF values of
9716 the induction variable. E.g., for an iv with IV_PHI='X' and
9717 evolution S, for a vector of 4 units, we want to compute:
9718 [X, X + S, X + 2*S, X + 3*S]. */
9720 if (dump_enabled_p ())
9721 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9723 pe = loop_preheader_edge (iv_loop);
9724 /* Find the first insertion point in the BB. */
9725 basic_block bb = gimple_bb (phi);
9726 si = gsi_after_labels (bb);
9728 gimple_seq stmts = NULL;
9730 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9731 /* If we are using the loop mask to "peel" for alignment then we need
9732 to adjust the start value here. */
9733 if (niters_skip != NULL_TREE)
9734 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9735 step_expr, induction_type);
9737 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9738 step_expr, nunits, vectype,
9739 induction_type);
9740 if (stmts)
9742 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9743 gcc_assert (!new_bb);
9746 stmts = NULL;
9747 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9748 vf, induction_type);
9749 if (stmts)
9751 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9752 gcc_assert (!new_bb);
9755 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9756 new_name, vectype,
9757 induction_type);
9758 /* Create the following def-use cycle:
9759 loop prolog:
9760 vec_init = ...
9761 vec_step = ...
9762 loop:
9763 vec_iv = PHI <vec_init, vec_loop>
9765 STMT
9767 vec_loop = vec_iv + vec_step; */
9769 /* Create the induction-phi that defines the induction-operand. */
9770 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9771 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9772 induc_def = PHI_RESULT (induction_phi);
9774 /* Create the iv update inside the loop. */
9775 stmts = NULL;
9776 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9777 induc_def, vec_step,
9778 induction_type);
9780 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9781 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9783 /* Set the arguments of the phi node: */
9784 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9785 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9786 UNKNOWN_LOCATION);
9788 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9789 *vec_stmt = induction_phi;
9791 /* In case the vectorization factor (VF) is bigger than the number
9792 of elements that we can fit in a vectype (nunits), we have to generate
9793 more than one vector stmt, i.e. we need to "unroll" the
9794 vector stmt by a factor of VF/nunits. For more details see the documentation
9795 in vectorizable_operation. */
9797 if (ncopies > 1)
9799 stmts = NULL;
9800 /* FORNOW. This restriction should be relaxed. */
9801 gcc_assert (!nested_in_vect_loop);
9803 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9804 nunits, induction_type);
9806 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9807 new_name, vectype,
9808 induction_type);
9809 vec_def = induc_def;
9810 for (i = 1; i < ncopies; i++)
9812 /* vec_i = vec_prev + vec_step. */
9813 stmts = NULL;
9814 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9815 vec_def, vec_step,
9816 induction_type);
9817 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9818 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9819 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9823 if (dump_enabled_p ())
9824 dump_printf_loc (MSG_NOTE, vect_location,
9825 "transform induction: created def-use cycle: %G%G",
9826 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9828 return true;
9831 /* Function vectorizable_induction
9833 Check if STMT_INFO performs an induction computation that can be vectorized.
9834 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9835 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9836 Return true if STMT_INFO is vectorizable in this way. */
9838 bool
9839 vectorizable_induction (loop_vec_info loop_vinfo,
9840 stmt_vec_info stmt_info,
9841 gimple **vec_stmt, slp_tree slp_node,
9842 stmt_vector_for_cost *cost_vec)
9844 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9845 unsigned ncopies;
9846 bool nested_in_vect_loop = false;
9847 class loop *iv_loop;
9848 tree vec_def;
9849 edge pe = loop_preheader_edge (loop);
9850 basic_block new_bb;
9851 tree new_vec, vec_init, vec_step, t;
9852 tree new_name;
9853 gimple *new_stmt;
9854 gphi *induction_phi;
9855 tree induc_def, vec_dest;
9856 tree init_expr, step_expr;
9857 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9858 unsigned i;
9859 tree expr;
9860 gimple_stmt_iterator si;
9861 enum vect_induction_op_type induction_type
9862 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9864 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9865 if (!phi)
9866 return false;
9868 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9869 return false;
9871 /* Make sure it was recognized as an induction computation. */
9872 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9873 return false;
9875 /* Handle nonlinear induction in a separate place. */
9876 if (induction_type != vect_step_op_add)
9877 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9878 vec_stmt, slp_node, cost_vec);
9880 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9881 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9883 if (slp_node)
9884 ncopies = 1;
9885 else
9886 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9887 gcc_assert (ncopies >= 1);
9889 /* FORNOW. These restrictions should be relaxed. */
9890 if (nested_in_vect_loop_p (loop, stmt_info))
9892 imm_use_iterator imm_iter;
9893 use_operand_p use_p;
9894 gimple *exit_phi;
9895 edge latch_e;
9896 tree loop_arg;
9898 if (ncopies > 1)
9900 if (dump_enabled_p ())
9901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9902 "multiple types in nested loop.\n");
9903 return false;
9906 exit_phi = NULL;
9907 latch_e = loop_latch_edge (loop->inner);
9908 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9909 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9911 gimple *use_stmt = USE_STMT (use_p);
9912 if (is_gimple_debug (use_stmt))
9913 continue;
9915 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9917 exit_phi = use_stmt;
9918 break;
9921 if (exit_phi)
9923 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9924 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9925 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9927 if (dump_enabled_p ())
9928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9929 "inner-loop induction only used outside "
9930 "of the outer vectorized loop.\n");
9931 return false;
9935 nested_in_vect_loop = true;
9936 iv_loop = loop->inner;
9938 else
9939 iv_loop = loop;
9940 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9942 if (slp_node && !nunits.is_constant ())
9944 /* The current SLP code creates the step value element-by-element. */
9945 if (dump_enabled_p ())
9946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9947 "SLP induction not supported for variable-length"
9948 " vectors.\n");
9949 return false;
9952 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9954 if (dump_enabled_p ())
9955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9956 "floating point induction vectorization disabled\n");
9957 return false;
9960 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9961 gcc_assert (step_expr != NULL_TREE);
9962 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9964 /* Check for backend support of PLUS/MINUS_EXPR. */
9965 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9966 || !directly_supported_p (MINUS_EXPR, step_vectype))
9967 return false;
9969 if (!vec_stmt) /* transformation not required. */
9971 unsigned inside_cost = 0, prologue_cost = 0;
9972 if (slp_node)
9974 /* We eventually need to set a vector type on invariant
9975 arguments. */
9976 unsigned j;
9977 slp_tree child;
9978 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9979 if (!vect_maybe_update_slp_op_vectype
9980 (child, SLP_TREE_VECTYPE (slp_node)))
9982 if (dump_enabled_p ())
9983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9984 "incompatible vector types for "
9985 "invariants\n");
9986 return false;
9988 /* loop cost for vec_loop. */
9989 inside_cost
9990 = record_stmt_cost (cost_vec,
9991 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9992 vector_stmt, stmt_info, 0, vect_body);
9993 /* prologue cost for vec_init (if not nested) and step. */
9994 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9995 scalar_to_vec,
9996 stmt_info, 0, vect_prologue);
9998 else /* if (!slp_node) */
10000 /* loop cost for vec_loop. */
10001 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10002 stmt_info, 0, vect_body);
10003 /* prologue cost for vec_init and vec_step. */
10004 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10005 stmt_info, 0, vect_prologue);
10007 if (dump_enabled_p ())
10008 dump_printf_loc (MSG_NOTE, vect_location,
10009 "vect_model_induction_cost: inside_cost = %d, "
10010 "prologue_cost = %d .\n", inside_cost,
10011 prologue_cost);
10013 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10014 DUMP_VECT_SCOPE ("vectorizable_induction");
10015 return true;
10018 /* Transform. */
10020 /* Compute a vector variable, initialized with the first VF values of
10021 the induction variable. E.g., for an iv with IV_PHI='X' and
10022 evolution S, for a vector of 4 units, we want to compute:
10023 [X, X + S, X + 2*S, X + 3*S]. */
10025 if (dump_enabled_p ())
10026 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10028 pe = loop_preheader_edge (iv_loop);
10029 /* Find the first insertion point in the BB. */
10030 basic_block bb = gimple_bb (phi);
10031 si = gsi_after_labels (bb);
10033 /* For SLP induction we have to generate several IVs as for example
10034 with group size 3 we need
10035 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10036 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10037 if (slp_node)
10039 /* Enforced above. */
10040 unsigned int const_nunits = nunits.to_constant ();
10042 /* The initial values are vectorized, but any lanes > group_size
10043 need adjustment. */
10044 slp_tree init_node
10045 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10047 /* Gather steps. Since we do not vectorize inductions as
10048 cycles we have to reconstruct the step from SCEV data. */
10049 unsigned group_size = SLP_TREE_LANES (slp_node);
10050 tree *steps = XALLOCAVEC (tree, group_size);
10051 tree *inits = XALLOCAVEC (tree, group_size);
10052 stmt_vec_info phi_info;
10053 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10055 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10056 if (!init_node)
10057 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10058 pe->dest_idx);
10061 /* Now generate the IVs. */
10062 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10063 gcc_assert ((const_nunits * nvects) % group_size == 0);
10064 unsigned nivs;
10065 if (nested_in_vect_loop)
10066 nivs = nvects;
10067 else
10069 /* Compute the number of distinct IVs we need. First reduce
10070 group_size if it is a multiple of const_nunits so we get
10071 one IV for a group_size of 4 but const_nunits 2. */
10072 unsigned group_sizep = group_size;
10073 if (group_sizep % const_nunits == 0)
10074 group_sizep = group_sizep / const_nunits;
10075 nivs = least_common_multiple (group_sizep,
10076 const_nunits) / const_nunits;
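/* For example, with group_size 4 and const_nunits 2 this gives
   group_sizep 2 and nivs 1, while group_size 6 and const_nunits 4
   leave group_sizep at 6 and give nivs = lcm(6,4)/4 = 3.  */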
10078 tree stept = TREE_TYPE (step_vectype);
10079 tree lupdate_mul = NULL_TREE;
10080 if (!nested_in_vect_loop)
10082 /* The number of iterations covered in one vector iteration. */
10083 unsigned lup_mul = (nvects * const_nunits) / group_size;
10084 lupdate_mul
10085 = build_vector_from_val (step_vectype,
10086 SCALAR_FLOAT_TYPE_P (stept)
10087 ? build_real_from_wide (stept, lup_mul,
10088 UNSIGNED)
10089 : build_int_cstu (stept, lup_mul));
10091 tree peel_mul = NULL_TREE;
10092 gimple_seq init_stmts = NULL;
10093 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10095 if (SCALAR_FLOAT_TYPE_P (stept))
10096 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10097 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10098 else
10099 peel_mul = gimple_convert (&init_stmts, stept,
10100 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10101 peel_mul = gimple_build_vector_from_val (&init_stmts,
10102 step_vectype, peel_mul);
10104 unsigned ivn;
10105 auto_vec<tree> vec_steps;
10106 for (ivn = 0; ivn < nivs; ++ivn)
10108 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10109 tree_vector_builder init_elts (vectype, const_nunits, 1);
10110 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10111 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10113 /* The scalar steps of the IVs. */
10114 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10115 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10116 step_elts.quick_push (elt);
10117 if (!init_node)
10119 /* The scalar inits of the IVs if not vectorized. */
10120 elt = inits[(ivn*const_nunits + eltn) % group_size];
10121 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10122 TREE_TYPE (elt)))
10123 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10124 TREE_TYPE (vectype), elt);
10125 init_elts.quick_push (elt);
10127 /* The number of steps to add to the initial values. */
10128 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10129 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10130 ? build_real_from_wide (stept,
10131 mul_elt, UNSIGNED)
10132 : build_int_cstu (stept, mul_elt));
10134 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10135 vec_steps.safe_push (vec_step);
10136 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10137 if (peel_mul)
10138 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10139 step_mul, peel_mul);
10140 if (!init_node)
10141 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10143 /* Create the induction-phi that defines the induction-operand. */
10144 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10145 "vec_iv_");
10146 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10147 induc_def = PHI_RESULT (induction_phi);
10149 /* Create the iv update inside the loop */
10150 tree up = vec_step;
10151 if (lupdate_mul)
10152 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10153 vec_step, lupdate_mul);
10154 gimple_seq stmts = NULL;
10155 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10156 vec_def = gimple_build (&stmts,
10157 PLUS_EXPR, step_vectype, vec_def, up);
10158 vec_def = gimple_convert (&stmts, vectype, vec_def);
10159 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10160 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10161 UNKNOWN_LOCATION);
10163 if (init_node)
10164 vec_init = vect_get_slp_vect_def (init_node, ivn);
10165 if (!nested_in_vect_loop
10166 && !integer_zerop (step_mul))
10168 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10169 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10170 vec_step, step_mul);
10171 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10172 vec_def, up);
10173 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10176 /* Set the arguments of the phi node: */
10177 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10179 slp_node->push_vec_def (induction_phi);
10181 if (!nested_in_vect_loop)
10183 /* Fill up to the number of vectors we need for the whole group. */
10184 nivs = least_common_multiple (group_size,
10185 const_nunits) / const_nunits;
10186 vec_steps.reserve (nivs-ivn);
10187 for (; ivn < nivs; ++ivn)
10189 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10190 vec_steps.quick_push (vec_steps[0]);
10194 /* Re-use IVs when we can. We are generating further vector
10195 stmts by adding VF' * stride to the IVs generated above. */
10196 if (ivn < nvects)
10198 unsigned vfp
10199 = least_common_multiple (group_size, const_nunits) / group_size;
10200 tree lupdate_mul
10201 = build_vector_from_val (step_vectype,
10202 SCALAR_FLOAT_TYPE_P (stept)
10203 ? build_real_from_wide (stept,
10204 vfp, UNSIGNED)
10205 : build_int_cstu (stept, vfp));
10206 for (; ivn < nvects; ++ivn)
10208 gimple *iv
10209 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10210 tree def = gimple_get_lhs (iv);
10211 if (ivn < 2*nivs)
10212 vec_steps[ivn - nivs]
10213 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10214 vec_steps[ivn - nivs], lupdate_mul);
10215 gimple_seq stmts = NULL;
10216 def = gimple_convert (&stmts, step_vectype, def);
10217 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10218 def, vec_steps[ivn % nivs]);
10219 def = gimple_convert (&stmts, vectype, def);
10220 if (gimple_code (iv) == GIMPLE_PHI)
10221 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10222 else
10224 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10225 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10227 slp_node->push_vec_def (def);
10231 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10232 gcc_assert (!new_bb);
10234 return true;
10237 init_expr = vect_phi_initial_value (phi);
10239 gimple_seq stmts = NULL;
10240 if (!nested_in_vect_loop)
10242 /* Convert the initial value to the IV update type. */
10243 tree new_type = TREE_TYPE (step_expr);
10244 init_expr = gimple_convert (&stmts, new_type, init_expr);
10246 /* If we are using the loop mask to "peel" for alignment then we need
10247 to adjust the start value here. */
10248 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10249 if (skip_niters != NULL_TREE)
10251 if (FLOAT_TYPE_P (vectype))
10252 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10253 skip_niters);
10254 else
10255 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10256 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10257 skip_niters, step_expr);
10258 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10259 init_expr, skip_step);
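/* I.e. lane L of the first vector then holds
   INIT - SKIP * STEP + L * STEP, so lane SKIP (the first unmasked
   lane) still starts at the original initial value.  */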
10263 if (stmts)
10265 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10266 gcc_assert (!new_bb);
10269 /* Create the vector that holds the initial_value of the induction. */
10270 if (nested_in_vect_loop)
10272 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10273 been created during vectorization of previous stmts. We obtain it
10274 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10275 auto_vec<tree> vec_inits;
10276 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10277 init_expr, &vec_inits);
10278 vec_init = vec_inits[0];
10279 /* If the initial value is not of proper type, convert it. */
10280 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10282 new_stmt
10283 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10284 vect_simple_var,
10285 "vec_iv_"),
10286 VIEW_CONVERT_EXPR,
10287 build1 (VIEW_CONVERT_EXPR, vectype,
10288 vec_init));
10289 vec_init = gimple_assign_lhs (new_stmt);
10290 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10291 new_stmt);
10292 gcc_assert (!new_bb);
10295 else
10297 /* iv_loop is the loop to be vectorized. Create:
10298 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10299 stmts = NULL;
10300 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10302 unsigned HOST_WIDE_INT const_nunits;
10303 if (nunits.is_constant (&const_nunits))
10305 tree_vector_builder elts (step_vectype, const_nunits, 1);
10306 elts.quick_push (new_name);
10307 for (i = 1; i < const_nunits; i++)
10309 /* Create: new_name_i = new_name + step_expr */
10310 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10311 new_name, step_expr);
10312 elts.quick_push (new_name);
10314 /* Create a vector from [new_name_0, new_name_1, ...,
10315 new_name_nunits-1] */
10316 vec_init = gimple_build_vector (&stmts, &elts);
10318 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10319 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10320 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10321 new_name, step_expr);
10322 else
10324 /* Build:
10325 [base, base, base, ...]
10326 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10327 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10328 gcc_assert (flag_associative_math);
10329 tree index = build_index_vector (step_vectype, 0, 1);
10330 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10331 new_name);
10332 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10333 step_expr);
10334 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10335 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10336 vec_init, step_vec);
10337 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10338 vec_init, base_vec);
10340 vec_init = gimple_convert (&stmts, vectype, vec_init);
10342 if (stmts)
10344 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10345 gcc_assert (!new_bb);
10350 /* Create the vector that holds the step of the induction. */
10351 gimple_stmt_iterator *step_iv_si = NULL;
10352 if (nested_in_vect_loop)
10353 /* iv_loop is nested in the loop to be vectorized. Generate:
10354 vec_step = [S, S, S, S] */
10355 new_name = step_expr;
10356 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10358 /* When we're using the loop_len produced by SELECT_VL, the non-final
10359 iterations are not always processing VF elements. So vectorize the
10360 induction variable instead of
10362 _21 = vect_vec_iv_.6_22 + { VF, ... };
10364 We should generate:
10366 _35 = .SELECT_VL (ivtmp_33, VF);
10367 vect_cst__22 = [vec_duplicate_expr] _35;
10368 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10369 gcc_assert (!slp_node);
10370 gimple_seq seq = NULL;
10371 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10372 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10373 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10374 unshare_expr (len)),
10375 &seq, true, NULL_TREE);
10376 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10377 step_expr);
10378 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10379 step_iv_si = &si;
10381 else
10383 /* iv_loop is the loop to be vectorized. Generate:
10384 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10385 gimple_seq seq = NULL;
10386 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10388 expr = build_int_cst (integer_type_node, vf);
10389 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10391 else
10392 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10393 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10394 expr, step_expr);
10395 if (seq)
10397 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10398 gcc_assert (!new_bb);
10402 t = unshare_expr (new_name);
10403 gcc_assert (CONSTANT_CLASS_P (new_name)
10404 || TREE_CODE (new_name) == SSA_NAME);
10405 new_vec = build_vector_from_val (step_vectype, t);
10406 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10407 new_vec, step_vectype, step_iv_si);
10410 /* Create the following def-use cycle:
10411 loop prolog:
10412 vec_init = ...
10413 vec_step = ...
10414 loop:
10415 vec_iv = PHI <vec_init, vec_loop>
10417 STMT
10419 vec_loop = vec_iv + vec_step; */
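/* For the non-nested case with X = 0, S = 1 and VF 4 this gives
   vec_init = { 0, 1, 2, 3 } and vec_step = { 4, 4, 4, 4 }, so each
   vector iteration advances the IV by four scalar steps.  */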
10421 /* Create the induction-phi that defines the induction-operand. */
10422 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10423 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10424 induc_def = PHI_RESULT (induction_phi);
10426 /* Create the iv update inside the loop */
10427 stmts = NULL;
10428 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10429 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10430 vec_def = gimple_convert (&stmts, vectype, vec_def);
10431 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10432 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10434 /* Set the arguments of the phi node: */
10435 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10436 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10437 UNKNOWN_LOCATION);
10439 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10440 *vec_stmt = induction_phi;
10442 /* In case that vectorization factor (VF) is bigger than the number
10443 of elements that we can fit in a vectype (nunits), we have to generate
10444 more than one vector stmt, i.e., we need to "unroll" the
10445 vector stmt by a factor VF/nunits. For more details see documentation
10446 in vectorizable_operation. */
10448 if (ncopies > 1)
10450 gimple_seq seq = NULL;
10451 /* FORNOW. This restriction should be relaxed. */
10452 gcc_assert (!nested_in_vect_loop);
10453 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10454 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10456 /* Create the vector that holds the step of the induction. */
10457 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10459 expr = build_int_cst (integer_type_node, nunits);
10460 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10462 else
10463 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10464 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10465 expr, step_expr);
10466 if (seq)
10468 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10469 gcc_assert (!new_bb);
10472 t = unshare_expr (new_name);
10473 gcc_assert (CONSTANT_CLASS_P (new_name)
10474 || TREE_CODE (new_name) == SSA_NAME);
10475 new_vec = build_vector_from_val (step_vectype, t);
10476 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10477 new_vec, step_vectype, NULL);
10479 vec_def = induc_def;
10480 for (i = 1; i < ncopies + 1; i++)
10482 /* vec_i = vec_prev + vec_step */
10483 gimple_seq stmts = NULL;
10484 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10485 vec_def = gimple_build (&stmts,
10486 PLUS_EXPR, step_vectype, vec_def, vec_step);
10487 vec_def = gimple_convert (&stmts, vectype, vec_def);
10489 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10490 if (i < ncopies)
10492 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10493 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10495 else
10497 /* vec_1 = vec_iv + (VF/n * S)
10498 vec_2 = vec_1 + (VF/n * S)
10500 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10502 vec_n is used as vec_loop to save the large step register and
10503 related operations. */
10504 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10505 UNKNOWN_LOCATION);
10510 if (dump_enabled_p ())
10511 dump_printf_loc (MSG_NOTE, vect_location,
10512 "transform induction: created def-use cycle: %G%G",
10513 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10515 return true;
10518 /* Function vectorizable_live_operation.
10520 STMT_INFO computes a value that is used outside the loop. Check if
10521 it can be supported. */
10523 bool
10524 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10525 slp_tree slp_node, slp_instance slp_node_instance,
10526 int slp_index, bool vec_stmt_p,
10527 stmt_vector_for_cost *cost_vec)
10529 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10530 imm_use_iterator imm_iter;
10531 tree lhs, lhs_type, bitsize;
10532 tree vectype = (slp_node
10533 ? SLP_TREE_VECTYPE (slp_node)
10534 : STMT_VINFO_VECTYPE (stmt_info));
10535 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10536 int ncopies;
10537 gimple *use_stmt;
10538 auto_vec<tree> vec_oprnds;
10539 int vec_entry = 0;
10540 poly_uint64 vec_index = 0;
10542 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10544 /* If a stmt of a reduction is live, vectorize it via
10545 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10546 validity so just trigger the transform here. */
10547 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10549 if (!vec_stmt_p)
10550 return true;
10551 if (slp_node)
10553 /* For reduction chains the meta-info is attached to
10554 the group leader. */
10555 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10556 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10557 /* For SLP reductions we vectorize the epilogue for
10558 all involved stmts together. */
10559 else if (slp_index != 0)
10560 return true;
10562 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10563 gcc_assert (reduc_info->is_reduc_info);
10564 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10565 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10566 return true;
10567 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10568 slp_node_instance);
10569 return true;
10572 /* If STMT is not relevant and it is a simple assignment and its inputs are
10573 invariant then it can remain in place, unvectorized. The original last
10574 scalar value that it computes will be used. */
10575 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10577 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10578 if (dump_enabled_p ())
10579 dump_printf_loc (MSG_NOTE, vect_location,
10580 "statement is simple and uses invariant. Leaving in "
10581 "place.\n");
10582 return true;
10585 if (slp_node)
10586 ncopies = 1;
10587 else
10588 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10590 if (slp_node)
10592 gcc_assert (slp_index >= 0);
10594 /* Get the last occurrence of the scalar index from the concatenation of
10595 all the slp vectors. Calculate which slp vector it is and the index
10596 within. */
10597 int num_scalar = SLP_TREE_LANES (slp_node);
10598 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10599 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
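/* For example, two vectors of four lanes holding three scalars with
   slp_index 2 give pos = 2*4 - 3 + 2 = 7, i.e. the requested lane is
   lane 3 of vector 1.  */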
10601 /* Calculate which vector contains the result, and which lane of
10602 that vector we need. */
10603 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10605 if (dump_enabled_p ())
10606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10607 "Cannot determine which vector holds the"
10608 " final result.\n");
10609 return false;
10613 if (!vec_stmt_p)
10615 /* No transformation required. */
10616 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10618 if (slp_node)
10620 if (dump_enabled_p ())
10621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10622 "can't operate on partial vectors "
10623 "because an SLP statement is live after "
10624 "the loop.\n");
10625 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10627 else if (ncopies > 1)
10629 if (dump_enabled_p ())
10630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10631 "can't operate on partial vectors "
10632 "because ncopies is greater than 1.\n");
10633 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10635 else
10637 gcc_assert (ncopies == 1 && !slp_node);
10638 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10639 OPTIMIZE_FOR_SPEED))
10640 vect_record_loop_mask (loop_vinfo,
10641 &LOOP_VINFO_MASKS (loop_vinfo),
10642 1, vectype, NULL);
10643 else if (can_vec_extract_var_idx_p (
10644 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10645 vect_record_loop_len (loop_vinfo,
10646 &LOOP_VINFO_LENS (loop_vinfo),
10647 1, vectype, 1);
10648 else
10650 if (dump_enabled_p ())
10651 dump_printf_loc (
10652 MSG_MISSED_OPTIMIZATION, vect_location,
10653 "can't operate on partial vectors "
10654 "because the target doesn't support extract "
10655 "last reduction.\n");
10656 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10660 /* ??? Enable for loop costing as well. */
10661 if (!loop_vinfo)
10662 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10663 0, vect_epilogue);
10664 return true;
10667 /* Use the lhs of the original scalar statement. */
10668 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10669 if (dump_enabled_p ())
10670 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10671 "stmt %G", stmt);
10673 lhs = gimple_get_lhs (stmt);
10674 lhs_type = TREE_TYPE (lhs);
10676 bitsize = vector_element_bits_tree (vectype);
10678 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10679 tree vec_lhs, bitstart;
10680 gimple *vec_stmt;
10681 if (slp_node)
10683 gcc_assert (!loop_vinfo
10684 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10685 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10687 /* Get the correct slp vectorized stmt. */
10688 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10689 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10691 /* Get entry to use. */
10692 bitstart = bitsize_int (vec_index);
10693 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10695 else
10697 /* For multiple copies, get the last copy. */
10698 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10699 vec_lhs = gimple_get_lhs (vec_stmt);
10701 /* Get the last lane in the vector. */
10702 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
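/* E.g. with 32-bit elements and four lanes this is bit position
   3 * 32 = 96, selecting the last lane.  */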
10705 if (loop_vinfo)
10707 /* To ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
10708 PHI requirement, insert one phi node for it. It looks like:
10709 loop;
10711 # lhs' = PHI <lhs>
10713 loop;
10715 # vec_lhs' = PHI <vec_lhs>
10716 new_tree = lane_extract <vec_lhs', ...>;
10717 lhs' = new_tree; */
10719 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10720 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
10721 gcc_assert (single_pred_p (exit_bb));
10723 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10724 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10725 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
10727 gimple_seq stmts = NULL;
10728 tree new_tree;
10729 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10731 /* Emit:
10733 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10735 where VEC_LHS is the vectorized live-out result and LEN is
10736 the loop length for the final iteration. */
10737 gcc_assert (ncopies == 1 && !slp_node);
10738 gimple_seq tem = NULL;
10739 gimple_stmt_iterator gsi = gsi_last (tem);
10740 tree len
10741 = vect_get_loop_len (loop_vinfo, &gsi,
10742 &LOOP_VINFO_LENS (loop_vinfo),
10743 1, vectype, 0, 0);
10745 /* BIAS - 1. */
10746 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10747 tree bias_minus_one
10748 = int_const_binop (MINUS_EXPR,
10749 build_int_cst (TREE_TYPE (len), biasval),
10750 build_one_cst (TREE_TYPE (len)));
10752 /* LAST_INDEX = LEN + (BIAS - 1). */
10753 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10754 len, bias_minus_one);
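/* With a bias of zero this is simply LEN - 1, the index of the
   last active element.  */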
10756 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10757 tree scalar_res
10758 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10759 vec_lhs_phi, last_index);
10761 /* Convert the extracted vector element to the scalar type. */
10762 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10764 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10766 /* Emit:
10768 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10770 where VEC_LHS is the vectorized live-out result and MASK is
10771 the loop mask for the final iteration. */
10772 gcc_assert (ncopies == 1 && !slp_node);
10773 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10774 gimple_seq tem = NULL;
10775 gimple_stmt_iterator gsi = gsi_last (tem);
10776 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10777 &LOOP_VINFO_MASKS (loop_vinfo),
10778 1, vectype, 0);
10779 gimple_seq_add_seq (&stmts, tem);
10780 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10781 mask, vec_lhs_phi);
10783 /* Convert the extracted vector element to the scalar type. */
10784 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10786 else
10788 tree bftype = TREE_TYPE (vectype);
10789 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10790 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10791 new_tree = build3 (BIT_FIELD_REF, bftype,
10792 vec_lhs_phi, bitsize, bitstart);
10793 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10794 &stmts, true, NULL_TREE);
10797 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10798 if (stmts)
10799 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10801 /* Remove existing phis that copy from lhs and create copies
10802 from new_tree. */
10803 gimple_stmt_iterator gsi;
10804 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
10806 gimple *phi = gsi_stmt (gsi);
10807 if ((gimple_phi_arg_def (phi, 0) == lhs))
10809 remove_phi_node (&gsi, false);
10810 tree lhs_phi = gimple_phi_result (phi);
10811 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10812 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10814 else
10815 gsi_next (&gsi);
10818 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
10819 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10820 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10822 else
10824 /* For basic-block vectorization simply insert the lane-extraction. */
10825 tree bftype = TREE_TYPE (vectype);
10826 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10827 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10828 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10829 vec_lhs, bitsize, bitstart);
10830 gimple_seq stmts = NULL;
10831 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10832 &stmts, true, NULL_TREE);
10833 if (TREE_CODE (new_tree) == SSA_NAME
10834 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10835 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10836 if (is_a <gphi *> (vec_stmt))
10838 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10839 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10841 else
10843 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10844 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10847 /* Replace the use of lhs with the newly computed result. If the use stmt is
10848 a single-arg PHI, just replace all uses of the PHI result. This is necessary
10849 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
10850 use_operand_p use_p;
10851 stmt_vec_info use_stmt_info;
10852 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10853 if (!is_gimple_debug (use_stmt)
10854 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10855 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10857 /* ??? This can happen when the live lane ends up being
10858 rooted in a vector construction code-generated by an
10859 external SLP node (and code-generation for that already
10860 happened). See gcc.dg/vect/bb-slp-47.c.
10861 Doing this is what would happen if that vector CTOR
10862 were not code-generated yet so it is not too bad.
10863 ??? In fact we'd likely want to avoid this situation
10864 in the first place. */
10865 if (TREE_CODE (new_tree) == SSA_NAME
10866 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10867 && gimple_code (use_stmt) != GIMPLE_PHI
10868 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10869 use_stmt))
10871 if (dump_enabled_p ())
10872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10873 "Using original scalar computation for "
10874 "live lane because use preceeds vector "
10875 "def\n");
10876 continue;
10878 /* ??? It can also happen that we end up pulling a def into
10879 a loop where replacing out-of-loop uses would require
10880 a new LC SSA PHI node. Retain the original scalar in
10881 those cases as well. PR98064. */
10882 if (TREE_CODE (new_tree) == SSA_NAME
10883 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10884 && (gimple_bb (use_stmt)->loop_father
10885 != gimple_bb (vec_stmt)->loop_father)
10886 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10887 gimple_bb (use_stmt)->loop_father))
10889 if (dump_enabled_p ())
10890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10891 "Using original scalar computation for "
10892 "live lane because there is an out-of-loop "
10893 "definition for it\n");
10894 continue;
10896 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10897 SET_USE (use_p, new_tree);
10898 update_stmt (use_stmt);
10902 return true;
10905 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10907 static void
10908 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10910 ssa_op_iter op_iter;
10911 imm_use_iterator imm_iter;
10912 def_operand_p def_p;
10913 gimple *ustmt;
10915 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10917 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10919 basic_block bb;
10921 if (!is_gimple_debug (ustmt))
10922 continue;
10924 bb = gimple_bb (ustmt);
10926 if (!flow_bb_inside_loop_p (loop, bb))
10928 if (gimple_debug_bind_p (ustmt))
10930 if (dump_enabled_p ())
10931 dump_printf_loc (MSG_NOTE, vect_location,
10932 "killing debug use\n");
10934 gimple_debug_bind_reset_value (ustmt);
10935 update_stmt (ustmt);
10937 else
10938 gcc_unreachable ();
10944 /* Given loop represented by LOOP_VINFO, return true if computation of
10945 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10946 otherwise. */
10948 static bool
10949 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10951 /* Constant case. */
10952 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10954 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10955 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10957 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10958 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10959 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10960 return true;
10963 widest_int max;
10964 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10965 /* Check the upper bound of loop niters. */
10966 if (get_max_loop_iterations (loop, &max))
10968 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10969 signop sgn = TYPE_SIGN (type);
10970 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10971 if (max < type_max)
10972 return true;
10974 return false;
10977 /* Return a mask type with half the number of elements as OLD_TYPE,
10978 given that it should have mode NEW_MODE. */
10980 tree
10981 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10983 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10984 return build_truth_vector_type_for_mode (nunits, new_mode);
10987 /* Return a mask type with twice as many elements as OLD_TYPE,
10988 given that it should have mode NEW_MODE. */
10990 tree
10991 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10993 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10994 return build_truth_vector_type_for_mode (nunits, new_mode);
10997 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10998 contain a sequence of NVECTORS masks that each control a vector of type
10999 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11000 these vector masks with the vector version of SCALAR_MASK. */
11002 void
11003 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11004 unsigned int nvectors, tree vectype, tree scalar_mask)
11006 gcc_assert (nvectors != 0);
11008 if (scalar_mask)
11010 scalar_cond_masked_key cond (scalar_mask, nvectors);
11011 loop_vinfo->scalar_cond_masked_set.add (cond);
11014 masks->mask_set.add (std::make_pair (vectype, nvectors));
11017 /* Given a complete set of masks MASKS, extract mask number INDEX
11018 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11019 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11021 See the comment above vec_loop_masks for more details about the mask
11022 arrangement. */
11024 tree
11025 vect_get_loop_mask (loop_vec_info loop_vinfo,
11026 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11027 unsigned int nvectors, tree vectype, unsigned int index)
11029 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11030 == vect_partial_vectors_while_ult)
11032 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11033 tree mask_type = rgm->type;
11035 /* Populate the rgroup's mask array, if this is the first time we've
11036 used it. */
11037 if (rgm->controls.is_empty ())
11039 rgm->controls.safe_grow_cleared (nvectors, true);
11040 for (unsigned int i = 0; i < nvectors; ++i)
11042 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11043 /* Provide a dummy definition until the real one is available. */
11044 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11045 rgm->controls[i] = mask;
11049 tree mask = rgm->controls[index];
11050 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11051 TYPE_VECTOR_SUBPARTS (vectype)))
11053 /* A loop mask for data type X can be reused for data type Y
11054 if X has N times more elements than Y and if Y's elements
11055 are N times bigger than X's. In this case each sequence
11056 of N elements in the loop mask will be all-zero or all-one.
11057 We can then view-convert the mask so that each sequence of
11058 N elements is replaced by a single element. */
11059 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11060 TYPE_VECTOR_SUBPARTS (vectype)));
11061 gimple_seq seq = NULL;
11062 mask_type = truth_type_for (vectype);
11063 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11064 if (seq)
11065 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11067 return mask;
11069 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11070 == vect_partial_vectors_avx512)
11072 /* The number of scalars per iteration and the number of vectors are
11073 both compile-time constants. */
11074 unsigned int nscalars_per_iter
11075 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11076 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11078 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11080 /* The stored nV is dependent on the mask type produced. */
11081 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11082 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11083 == rgm->factor);
11084 nvectors = rgm->factor;
11086 /* Populate the rgroup's mask array, if this is the first time we've
11087 used it. */
11088 if (rgm->controls.is_empty ())
11090 rgm->controls.safe_grow_cleared (nvectors, true);
11091 for (unsigned int i = 0; i < nvectors; ++i)
11093 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11094 /* Provide a dummy definition until the real one is available. */
11095 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11096 rgm->controls[i] = mask;
11099 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11100 TYPE_VECTOR_SUBPARTS (vectype)))
11101 return rgm->controls[index];
11103 /* Split the vector if needed. Since we are dealing with integer mode
11104 masks with AVX512 we can operate on the integer representation
11105 performing the whole vector shifting. */
11106 unsigned HOST_WIDE_INT factor;
11107 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11108 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11109 gcc_assert (ok);
11110 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11111 tree mask_type = truth_type_for (vectype);
11112 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11113 unsigned vi = index / factor;
11114 unsigned vpart = index % factor;
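/* For example, a 16-element control reused for 4-element vectors has
   factor 4; index 5 then selects control word 1 shifted right by
   4 bits.  */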
11115 tree vec = rgm->controls[vi];
11116 gimple_seq seq = NULL;
11117 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11118 lang_hooks.types.type_for_mode
11119 (TYPE_MODE (rgm->type), 1), vec);
11120 /* For integer mode masks simply shift the right bits into position. */
11121 if (vpart != 0)
11122 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11123 build_int_cst (integer_type_node,
11124 (TYPE_VECTOR_SUBPARTS (vectype)
11125 * vpart)));
11126 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11127 (TYPE_MODE (mask_type), 1), vec);
11128 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11129 if (seq)
11130 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11131 return vec;
11133 else
11134 gcc_unreachable ();
11137 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11138 lengths for controlling an operation on VECTYPE. The operation splits
11139 each element of VECTYPE into FACTOR separate subelements, measuring the
11140 length as a number of these subelements. */
11142 void
11143 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11144 unsigned int nvectors, tree vectype, unsigned int factor)
11146 gcc_assert (nvectors != 0);
11147 if (lens->length () < nvectors)
11148 lens->safe_grow_cleared (nvectors, true);
11149 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11151 /* The number of scalars per iteration, the scalar occupied bytes and
11152 the number of vectors are all compile-time constants. */
11153 unsigned int nscalars_per_iter
11154 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11155 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
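/* For example, two V8HI vectors with a vectorization factor of 8
   give two scalars per iteration (e.g. an interleaved group of two).  */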
11157 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11159 /* For now, we only support cases in which all loads and stores fall back
11160 to VnQI or none do. */
11161 gcc_assert (!rgl->max_nscalars_per_iter
11162 || (rgl->factor == 1 && factor == 1)
11163 || (rgl->max_nscalars_per_iter * rgl->factor
11164 == nscalars_per_iter * factor));
11165 rgl->max_nscalars_per_iter = nscalars_per_iter;
11166 rgl->type = vectype;
11167 rgl->factor = factor;
11171 /* Given a complete set of lengths LENS, extract length number INDEX
11172 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11173 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11174 multiplied by the number of elements that should be processed.
11175 Insert any set-up statements before GSI. */
11177 tree
11178 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11179 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11180 unsigned int index, unsigned int factor)
11182 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11183 bool use_bias_adjusted_len =
11184 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11186 /* Populate the rgroup's len array, if this is the first time we've
11187 used it. */
11188 if (rgl->controls.is_empty ())
11190 rgl->controls.safe_grow_cleared (nvectors, true);
11191 for (unsigned int i = 0; i < nvectors; ++i)
11193 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11194 gcc_assert (len_type != NULL_TREE);
11196 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11198 /* Provide a dummy definition until the real one is available. */
11199 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11200 rgl->controls[i] = len;
11202 if (use_bias_adjusted_len)
11204 gcc_assert (i == 0);
11205 tree adjusted_len =
11206 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11207 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11208 rgl->bias_adjusted_ctrl = adjusted_len;
11213 if (use_bias_adjusted_len)
11214 return rgl->bias_adjusted_ctrl;
11216 tree loop_len = rgl->controls[index];
11217 if (rgl->factor == 1 && factor == 1)
11219 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11220 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11221 if (maybe_ne (nunits1, nunits2))
11223 /* A loop len for data type X can be reused for data type Y
11224 if X has N times more elements than Y and if Y's elements
11225 are N times bigger than X's. */
11226 gcc_assert (multiple_p (nunits1, nunits2));
11227 factor = exact_div (nunits1, nunits2).to_constant ();
11228 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11229 gimple_seq seq = NULL;
11230 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11231 build_int_cst (iv_type, factor));
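/* E.g. a length counted in 16 QImode elements is halved when reused
   for an 8-element HImode vector.  */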
11232 if (seq)
11233 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11236 return loop_len;
11239 /* Scale profiling counters by estimation for LOOP which is vectorized
11240 by factor VF.
11241 If FLAT is true, the loop we started with had unrealistically flat
11242 profile. */
11244 static void
11245 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11247 /* For flat profiles do not scale down proportionally by VF and only
11248 cap by known iteration count bounds. */
11249 if (flat)
11251 if (dump_file && (dump_flags & TDF_DETAILS))
11252 fprintf (dump_file,
11253 "Vectorized loop profile seems flat; not scaling iteration "
11254 "count down by the vectorization factor %i\n", vf);
11255 scale_loop_profile (loop, profile_probability::always (),
11256 get_likely_max_loop_iterations_int (loop));
11257 return;
11259 /* The loop body executes VF times fewer iterations and the exit probability increases VF times. */
11260 profile_count entry_count = loop_preheader_edge (loop)->count ();
11262 /* If we have an unreliable loop profile avoid dropping the entry
11263 count below the header count. This can happen since the loop
11264 has an unrealistically low trip count. */
11265 while (vf > 1
11266 && loop->header->count > entry_count
11267 && loop->header->count < entry_count * vf)
11269 if (dump_file && (dump_flags & TDF_DETAILS))
11270 fprintf (dump_file,
11271 "Vectorization factor %i seems too large for profile "
11272 "prevoiusly believed to be consistent; reducing.\n", vf);
11273 vf /= 2;
11276 if (entry_count.nonzero_p ())
11277 set_edge_probability_and_rescale_others
11278 (exit_e,
11279 entry_count.probability_in (loop->header->count / vf));
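/* For example, an entry count of 1000 and a header count of 10000
   with VF 4 yield a scaled header count of 2500 and an exit
   probability of 40%.  */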
11280 /* Avoid producing a very large exit probability when we do not have
11281 a sensible profile. */
11282 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11283 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11284 loop->latch->count = single_pred_edge (loop->latch)->count ();
11286 scale_loop_profile (loop, profile_probability::always () / vf,
11287 get_likely_max_loop_iterations_int (loop));
11290 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11291 latch edge values originally defined by it. */
11293 static void
11294 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11295 stmt_vec_info def_stmt_info)
11297 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11298 if (!def || TREE_CODE (def) != SSA_NAME)
11299 return;
11300 stmt_vec_info phi_info;
11301 imm_use_iterator iter;
11302 use_operand_p use_p;
11303 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11305 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11306 if (!phi)
11307 continue;
11308 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11309 && (phi_info = loop_vinfo->lookup_stmt (phi))
11310 && STMT_VINFO_RELEVANT_P (phi_info)))
11311 continue;
11312 loop_p loop = gimple_bb (phi)->loop_father;
11313 edge e = loop_latch_edge (loop);
11314 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11315 continue;
11317 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11318 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11319 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11321 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11322 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11323 gcc_assert (phi_defs.length () == latch_defs.length ());
11324 for (unsigned i = 0; i < phi_defs.length (); ++i)
11325 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11326 gimple_get_lhs (latch_defs[i]), e,
11327 gimple_phi_arg_location (phi, e->dest_idx));
11329 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11331 /* For first order recurrences we have to update both uses of
11332 the latch definition, the one in the PHI node and the one
11333 in the generated VEC_PERM_EXPR. */
11334 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11335 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11336 gcc_assert (phi_defs.length () == latch_defs.length ());
11337 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11338 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11339 for (unsigned i = 0; i < phi_defs.length (); ++i)
11341 gassign *perm = as_a <gassign *> (phi_defs[i]);
11342 if (i > 0)
11343 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11344 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11345 update_stmt (perm);
11347 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11348 gimple_phi_arg_location (phi, e->dest_idx));
11353 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11354 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11355 stmt_vec_info. */
11357 static bool
11358 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11359 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11361 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11362 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11364 if (dump_enabled_p ())
11365 dump_printf_loc (MSG_NOTE, vect_location,
11366 "------>vectorizing statement: %G", stmt_info->stmt);
11368 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11369 vect_loop_kill_debug_uses (loop, stmt_info);
11371 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11372 && !STMT_VINFO_LIVE_P (stmt_info))
11374 if (is_gimple_call (stmt_info->stmt)
11375 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11377 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11378 *seen_store = stmt_info;
11379 return false;
11381 return false;
11384 if (STMT_VINFO_VECTYPE (stmt_info))
11386 poly_uint64 nunits
11387 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11388 if (!STMT_SLP_TYPE (stmt_info)
11389 && maybe_ne (nunits, vf)
11390 && dump_enabled_p ())
11391 /* For SLP VF is set according to unrolling factor, and not
11392 to vector size, hence for SLP this print is not valid. */
11393 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11396 /* Pure SLP statements have already been vectorized. We still need
11397 to apply loop vectorization to hybrid SLP statements. */
11398 if (PURE_SLP_STMT (stmt_info))
11399 return false;
11401 if (dump_enabled_p ())
11402 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11404 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11405 *seen_store = stmt_info;
11407 return true;
11410 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11411 in the hash_map with their corresponding values. */
11413 static tree
11414 find_in_mapping (tree t, void *context)
11416 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11418 tree *value = mapping->get (t);
11419 return value ? *value : t;
11422 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11423 original loop that has now been vectorized.
11425 The inits of the data_references need to be advanced with the number of
11426 iterations of the main loop. This has been computed in vect_do_peeling and
11427 is stored in parameter ADVANCE. We first restore the data_references'
11428 initial offsets with the values recorded in ORIG_DRS_INIT.
11430 Since the loop_vec_info of this EPILOGUE was constructed for the original
11431 loop, its stmt_vec_infos all point to the original statements. These need
11432 to be updated to point to their corresponding copies as well as the SSA_NAMES
11433 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11435 The data_reference's connections also need to be updated. Their
11436 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11437 stmt_vec_infos, their statements need to point to their corresponding copy,
11438 if they are gather loads or scatter stores then their reference needs to be
11439 updated to point to its corresponding copy and finally we set
11440 'base_misaligned' to false as we have already peeled for alignment in the
11441 prologue of the main loop. */
11443 static void
11444 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11446 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11447 auto_vec<gimple *> stmt_worklist;
11448 hash_map<tree,tree> mapping;
11449 gimple *orig_stmt, *new_stmt;
11450 gimple_stmt_iterator epilogue_gsi;
11451 gphi_iterator epilogue_phi_gsi;
11452 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11453 basic_block *epilogue_bbs = get_loop_body (epilogue);
11454 unsigned i;
11456 free (LOOP_VINFO_BBS (epilogue_vinfo));
11457 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11459 /* Advance data_references with the number of iterations of the previous
11460 loop and its prologue. */
11461 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11464 /* The EPILOGUE loop is a copy of the original loop so they share the same
11465 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11466 point to the copied statements. We also create a mapping of all LHS' in
11467 the original loop and all the LHS' in the EPILOGUE and create worklists to
11468 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11469 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11471 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11472 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11474 new_stmt = epilogue_phi_gsi.phi ();
11476 gcc_assert (gimple_uid (new_stmt) > 0);
11477 stmt_vinfo
11478 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11480 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11481 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11483 mapping.put (gimple_phi_result (orig_stmt),
11484 gimple_phi_result (new_stmt));
11485 /* PHI nodes can not have patterns or related statements. */
11486 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11487 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11490 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11491 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11493 new_stmt = gsi_stmt (epilogue_gsi);
11494 if (is_gimple_debug (new_stmt))
11495 continue;
11497 gcc_assert (gimple_uid (new_stmt) > 0);
11498 stmt_vinfo
11499 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11501 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11502 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11504 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11505 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11507 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11509 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11510 for (gimple_stmt_iterator gsi = gsi_start (seq);
11511 !gsi_end_p (gsi); gsi_next (&gsi))
11512 stmt_worklist.safe_push (gsi_stmt (gsi));
11515 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11516 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11518 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11519 stmt_worklist.safe_push (stmt);
11520 /* Set BB such that the assert in
11521 'get_initial_def_for_reduction' is able to determine that
11522 the BB of the related stmt is inside this loop. */
11523 gimple_set_bb (stmt,
11524 gimple_bb (new_stmt));
11525 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11526 gcc_assert (related_vinfo == NULL
11527 || related_vinfo == stmt_vinfo);
11532 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11533 using the original main loop and thus need to be updated to refer to the
11534 cloned variables used in the epilogue. */
11535 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11537 gimple *stmt = stmt_worklist[i];
11538 tree *new_op;
11540 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11542 tree op = gimple_op (stmt, j);
11543 if ((new_op = mapping.get(op)))
11544 gimple_set_op (stmt, j, *new_op);
11545 else
11547 /* PR92429: The last argument of simplify_replace_tree disables
11548 folding when replacing arguments. This is required as
11549 otherwise you might end up with different statements than the
11550 ones analyzed in vect_loop_analyze, leading to different
11551 vectorization. */
11552 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11553 &find_in_mapping, &mapping, false);
11554 gimple_set_op (stmt, j, op);
11559 struct data_reference *dr;
11560 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11561 FOR_EACH_VEC_ELT (datarefs, i, dr)
11563 orig_stmt = DR_STMT (dr);
11564 gcc_assert (gimple_uid (orig_stmt) > 0);
11565 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11566 /* Data references for gather loads and scatter stores do not use the
11567 updated offset we set using ADVANCE. Instead we have to make sure the
11568 reference in the data references points to the corresponding copy of
11569 the original in the epilogue. Make sure to update both
11570 gather/scatters recognized by dataref analysis and also other
11571 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11572 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11573 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11574 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11576 DR_REF (dr)
11577 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11578 &find_in_mapping, &mapping);
11579 DR_BASE_ADDRESS (dr)
11580 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11581 &find_in_mapping, &mapping);
11583 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11584 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11585 /* The vector size of the epilogue is smaller than that of the main loop
11586 so the alignment requirement is either the same or lower. This means
11587 the dr will by definition be aligned. */
11588 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11591 epilogue_vinfo->shared->datarefs_copy.release ();
11592 epilogue_vinfo->shared->save_datarefs ();
11595 /* Function vect_transform_loop.
11597 The analysis phase has determined that the loop is vectorizable.
11598 Vectorize the loop - created vectorized stmts to replace the scalar
11599 stmts in the loop, and update the loop exit condition.
11600 Returns scalar epilogue loop if any. */
11602 class loop *
11603 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11605 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11606 class loop *epilogue = NULL;
11607 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11608 int nbbs = loop->num_nodes;
11609 int i;
11610 tree niters_vector = NULL_TREE;
11611 tree step_vector = NULL_TREE;
11612 tree niters_vector_mult_vf = NULL_TREE;
11613 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11614 unsigned int lowest_vf = constant_lower_bound (vf);
11615 gimple *stmt;
11616 bool check_profitability = false;
11617 unsigned int th;
11618 bool flat = maybe_flat_loop_profile (loop);
11620 DUMP_VECT_SCOPE ("vec_transform_loop");
11622 loop_vinfo->shared->check_datarefs ();
11624 /* Use the more conservative vectorization threshold. If the number
11625 of iterations is constant, assume the cost check has been performed
11626 by our caller. If the threshold makes all loops profitable that
11627 run at least the (estimated) vectorization factor number of times,
11628 checking is pointless, too. */
11629 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11630 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11632 if (dump_enabled_p ())
11633 dump_printf_loc (MSG_NOTE, vect_location,
11634 "Profitability threshold is %d loop iterations.\n",
11635 th);
11636 check_profitability = true;
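/* Note: roughly speaking, when CHECK_PROFITABILITY is set the guards
   emitted below by vect_loop_versioning or vect_do_peeling compare the
   scalar iteration count against TH, so that loops running fewer than
   TH iterations keep using the scalar code.  */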
11639 /* Make sure there exists a single-predecessor exit bb. Do this before
11640 versioning. */
11641 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11642 if (! single_pred_p (e->dest))
11644 split_loop_exit_edge (e, true);
11645 if (dump_enabled_p ())
11646 dump_printf (MSG_NOTE, "split exit edge\n");
11649 /* Version the loop first, if required, so the profitability check
11650 comes first. */
11652 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11654 class loop *sloop
11655 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11656 sloop->force_vectorize = false;
11657 check_profitability = false;
11660 /* Make sure there exists a single-predecessor exit bb also on the
11661 scalar loop copy. Do this after versioning but before peeling
11662 so the CFG structure is fine for both the scalar and the if-converted
11663 loop and slpeel_duplicate_current_defs_from_edges sees matched
11664 loop-closed PHI nodes on the exit. */
11665 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11667 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11668 if (! single_pred_p (e->dest))
11670 split_loop_exit_edge (e, true);
11671 if (dump_enabled_p ())
11672 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11676 tree niters = vect_build_loop_niters (loop_vinfo);
11677 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11678 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11679 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11680 tree advance;
11681 drs_init_vec orig_drs_init;
11683 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11684 &step_vector, &niters_vector_mult_vf, th,
11685 check_profitability, niters_no_overflow,
11686 &advance);
11687 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11688 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11690 /* Ifcvt duplicates the loop preheader and loop body and produces a
11691 basic block after the loop exit. We need to scale all of that. */
11692 basic_block preheader
11693 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11694 preheader->count
11695 = preheader->count.apply_probability
11696 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11697 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11698 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11699 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11700 = preheader->count;
11703 if (niters_vector == NULL_TREE)
11705 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11706 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11707 && known_eq (lowest_vf, vf))
11709 niters_vector
11710 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11711 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11712 step_vector = build_one_cst (TREE_TYPE (niters));
11714 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11715 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11716 &step_vector, niters_no_overflow);
11717 else
11718 /* vect_do_peeling subtracted the number of peeled prologue
11719 iterations from LOOP_VINFO_NITERS. */
11720 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11721 &niters_vector, &step_vector,
11722 niters_no_overflow);
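/* As an illustration (assuming no prologue peeling): with a compile-time
   NITERS of 17 and a constant VF of 4, NITERS_VECTOR becomes 17 / 4 == 4
   and STEP_VECTOR becomes 1; the remaining scalar iteration is handled by
   the epilogue.  */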
11725 /* 1) Make sure the loop header has exactly two entries
11726 2) Make sure we have a preheader basic block. */
11728 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11730 split_edge (loop_preheader_edge (loop));
11732 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11733 /* This will deal with any possible peeling. */
11734 vect_prepare_for_masked_peels (loop_vinfo);
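/* For fully-masked loops the peeling for alignment is not done by a
   scalar prologue loop; instead the first vector iteration masks off the
   leading LOOP_VINFO_MASK_SKIP_NITERS elements, which the call above
   prepares.  */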
11736 /* Schedule the SLP instances first, then handle loop vectorization
11737 below. */
11738 if (!loop_vinfo->slp_instances.is_empty ())
11740 DUMP_VECT_SCOPE ("scheduling SLP instances");
11741 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11744 /* FORNOW: the vectorizer supports only loops whose body consists
11745 of one basic block (header + empty latch). When the vectorizer
11746 supports more involved loop forms, the order in which the BBs are
11747 traversed needs to be reconsidered. */
11749 for (i = 0; i < nbbs; i++)
11751 basic_block bb = bbs[i];
11752 stmt_vec_info stmt_info;
11754 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11755 gsi_next (&si))
11757 gphi *phi = si.phi ();
11758 if (dump_enabled_p ())
11759 dump_printf_loc (MSG_NOTE, vect_location,
11760 "------>vectorizing phi: %G", (gimple *) phi);
11761 stmt_info = loop_vinfo->lookup_stmt (phi);
11762 if (!stmt_info)
11763 continue;
11765 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11766 vect_loop_kill_debug_uses (loop, stmt_info);
11768 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11769 && !STMT_VINFO_LIVE_P (stmt_info))
11770 continue;
11772 if (STMT_VINFO_VECTYPE (stmt_info)
11773 && (maybe_ne
11774 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11775 && dump_enabled_p ())
11776 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11778 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11779 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11780 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11781 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11782 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11783 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11784 && ! PURE_SLP_STMT (stmt_info))
11786 if (dump_enabled_p ())
11787 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11788 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11792 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11793 gsi_next (&si))
11795 gphi *phi = si.phi ();
11796 stmt_info = loop_vinfo->lookup_stmt (phi);
11797 if (!stmt_info)
11798 continue;
11800 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11801 && !STMT_VINFO_LIVE_P (stmt_info))
11802 continue;
11804 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11805 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11806 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11807 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11808 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11809 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11810 && ! PURE_SLP_STMT (stmt_info))
11811 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11814 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11815 !gsi_end_p (si);)
11817 stmt = gsi_stmt (si);
11818 /* During vectorization remove existing clobber stmts. */
11819 if (gimple_clobber_p (stmt))
11821 unlink_stmt_vdef (stmt);
11822 gsi_remove (&si, true);
11823 release_defs (stmt);
11825 else
11827 /* Ignore vector stmts created in the outer loop. */
11828 stmt_info = loop_vinfo->lookup_stmt (stmt);
11830 /* vector stmts created in the outer-loop during vectorization of
11831 stmts in an inner-loop may not have a stmt_info, and do not
11832 need to be vectorized. */
11833 stmt_vec_info seen_store = NULL;
11834 if (stmt_info)
11836 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11838 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11839 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11840 !gsi_end_p (subsi); gsi_next (&subsi))
11842 stmt_vec_info pat_stmt_info
11843 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11844 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11845 &si, &seen_store);
11847 stmt_vec_info pat_stmt_info
11848 = STMT_VINFO_RELATED_STMT (stmt_info);
11849 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11850 &si, &seen_store))
11851 maybe_set_vectorized_backedge_value (loop_vinfo,
11852 pat_stmt_info);
11854 else
11856 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11857 &seen_store))
11858 maybe_set_vectorized_backedge_value (loop_vinfo,
11859 stmt_info);
11862 gsi_next (&si);
11863 if (seen_store)
11865 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11866 /* Interleaving. The vectorization of the
11867 interleaving chain was completed - free all
11868 the stores in the chain. */
11869 vect_remove_stores (loop_vinfo,
11870 DR_GROUP_FIRST_ELEMENT (seen_store));
11871 else
11872 /* Free the attached stmt_vec_info and remove the stmt. */
11873 loop_vinfo->remove_stmt (stmt_info);
11878 /* Stub out scalar statements that must not survive vectorization.
11879 Doing this here helps with grouped statements, or statements that
11880 are involved in patterns. */
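/* Concretely (see the loop below): a leftover scalar call such as
   lhs = .MASK_LOAD (ptr, align, mask) whose LHS is not a vector is
   replaced by lhs = 0, and a leftover scalar conditional call such as
   lhs = .COND_ADD (mask, a, b, else) is replaced by lhs = else.  */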
11881 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11882 !gsi_end_p (gsi); gsi_next (&gsi))
11884 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11885 if (!call || !gimple_call_internal_p (call))
11886 continue;
11887 internal_fn ifn = gimple_call_internal_fn (call);
11888 if (ifn == IFN_MASK_LOAD)
11890 tree lhs = gimple_get_lhs (call);
11891 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11893 tree zero = build_zero_cst (TREE_TYPE (lhs));
11894 gimple *new_stmt = gimple_build_assign (lhs, zero);
11895 gsi_replace (&gsi, new_stmt, true);
11898 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11900 tree lhs = gimple_get_lhs (call);
11901 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11903 tree else_arg
11904 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11905 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11906 gsi_replace (&gsi, new_stmt, true);
11910 } /* BBs in loop */
11912 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11913 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11914 if (integer_onep (step_vector))
11915 niters_no_overflow = true;
11916 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11917 niters_vector, step_vector, niters_vector_mult_vf,
11918 !niters_no_overflow);
11920 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11922 /* True if the final iteration might not handle a full vector's
11923 worth of scalar iterations. */
11924 bool final_iter_may_be_partial
11925 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11926 /* The minimum number of iterations performed by the epilogue. This
11927 is 1 when peeling for gaps because we always need a final scalar
11928 iteration. */
11929 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11930 /* +1 to convert latch counts to loop iteration counts,
11931 -min_epilogue_iters to remove iterations that cannot be performed
11932 by the vector code. */
11933 int bias_for_lowest = 1 - min_epilogue_iters;
11934 int bias_for_assumed = bias_for_lowest;
11935 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11936 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11938 /* When the amount of peeling is known at compile time, the first
11939 iteration will have exactly alignment_npeels active elements.
11940 In the worst case it will have at least one. */
11941 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11942 bias_for_lowest += lowest_vf - min_first_active;
11943 bias_for_assumed += assumed_vf - min_first_active;
11945 /* In these calculations the "- 1" converts loop iteration counts
11946 back to latch counts. */
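/* A worked example (no peeling for gaps, no partial vectors): with a
   scalar latch bound of 11 (at most 12 iterations) and LOWEST_VF == 4,
   BIAS_FOR_LOWEST is 1 and the new bound is (11 + 1) / 4 - 1 == 2,
   i.e. at most 3 vector iterations.  */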
11947 if (loop->any_upper_bound)
11949 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11950 loop->nb_iterations_upper_bound
11951 = (final_iter_may_be_partial
11952 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11953 lowest_vf) - 1
11954 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11955 lowest_vf) - 1);
11956 if (main_vinfo
11957 /* Both peeling for alignment and peeling for gaps can end up
11958 with the scalar epilogue running for more than VF-1 iterations. */
11959 && !main_vinfo->peeling_for_alignment
11960 && !main_vinfo->peeling_for_gaps)
11962 unsigned int bound;
11963 poly_uint64 main_iters
11964 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11965 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11966 main_iters
11967 = upper_bound (main_iters,
11968 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11969 if (can_div_away_from_zero_p (main_iters,
11970 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11971 &bound))
11972 loop->nb_iterations_upper_bound
11973 = wi::umin ((bound_wide_int) (bound - 1),
11974 loop->nb_iterations_upper_bound);
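/* For instance, if the main loop has a VF of 16 and both thresholds are
   at most 16, MAIN_ITERS is 16; with an epilogue VF of 8 this gives
   BOUND == 2, so the epilogue's latch bound is capped at 1, i.e. at
   most two epilogue iterations.  */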
11977 if (loop->any_likely_upper_bound)
11978 loop->nb_iterations_likely_upper_bound
11979 = (final_iter_may_be_partial
11980 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11981 + bias_for_lowest, lowest_vf) - 1
11982 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11983 + bias_for_lowest, lowest_vf) - 1);
11984 if (loop->any_estimate)
11985 loop->nb_iterations_estimate
11986 = (final_iter_may_be_partial
11987 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11988 assumed_vf) - 1
11989 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11990 assumed_vf) - 1);
11991 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11992 assumed_vf, flat);
11994 if (dump_enabled_p ())
11996 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11998 dump_printf_loc (MSG_NOTE, vect_location,
11999 "LOOP VECTORIZED\n");
12000 if (loop->inner)
12001 dump_printf_loc (MSG_NOTE, vect_location,
12002 "OUTER LOOP VECTORIZED\n");
12003 dump_printf (MSG_NOTE, "\n");
12005 else
12006 dump_printf_loc (MSG_NOTE, vect_location,
12007 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12008 GET_MODE_NAME (loop_vinfo->vector_mode));
12011 /* Loops vectorized with a variable factor won't benefit from
12012 unrolling/peeling. */
12013 if (!vf.is_constant ())
12015 loop->unroll = 1;
12016 if (dump_enabled_p ())
12017 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12018 " variable-length vectorization factor\n");
12020 /* Free SLP instances here because otherwise stmt reference counting
12021 won't work. */
12022 slp_instance instance;
12023 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12024 vect_free_slp_instance (instance);
12025 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12026 /* Clear the safelen field since its value is invalid after vectorization:
12027 the vectorized loop can have loop-carried dependencies. */
12028 loop->safelen = 0;
12030 if (epilogue)
12032 update_epilogue_loop_vinfo (epilogue, advance);
12034 epilogue->simduid = loop->simduid;
12035 epilogue->force_vectorize = loop->force_vectorize;
12036 epilogue->dont_vectorize = false;
12039 return epilogue;
12042 /* The code below performs a simple optimization - it reverts
12043 if-conversion for masked stores: if the mask of a store is zero, the
12044 store is not performed and, if possible, neither are the producers of the stored values.
12045 For example,
12046 for (i=0; i<n; i++)
12047 if (c[i])
12049 p1[i] += 1;
12050 p2[i] = p3[i] + 2;
12052 this transformation will produce the following semi-hammock:
12054 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12056 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12057 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12058 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12059 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12060 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12061 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12065 void
12066 optimize_mask_stores (class loop *loop)
12068 basic_block *bbs = get_loop_body (loop);
12069 unsigned nbbs = loop->num_nodes;
12070 unsigned i;
12071 basic_block bb;
12072 class loop *bb_loop;
12073 gimple_stmt_iterator gsi;
12074 gimple *stmt;
12075 auto_vec<gimple *> worklist;
12076 auto_purge_vect_location sentinel;
12078 vect_location = find_loop_location (loop);
12079 /* Pick up all masked stores in loop if any. */
12080 for (i = 0; i < nbbs; i++)
12082 bb = bbs[i];
12083 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12084 gsi_next (&gsi))
12086 stmt = gsi_stmt (gsi);
12087 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12088 worklist.safe_push (stmt);
12092 free (bbs);
12093 if (worklist.is_empty ())
12094 return;
12096 /* Loop has masked stores. */
12097 while (!worklist.is_empty ())
12099 gimple *last, *last_store;
12100 edge e, efalse;
12101 tree mask;
12102 basic_block store_bb, join_bb;
12103 gimple_stmt_iterator gsi_to;
12104 tree vdef, new_vdef;
12105 gphi *phi;
12106 tree vectype;
12107 tree zero;
12109 last = worklist.pop ();
12110 mask = gimple_call_arg (last, 2);
12111 bb = gimple_bb (last);
12112 /* Create then_bb and the if-then structure in the CFG; then_bb
12113 belongs to the same loop as if_bb. That loop can differ from LOOP
12114 when a two-level loop nest is vectorized and the mask_store belongs
12115 to the inner loop. */
12116 e = split_block (bb, last);
12117 bb_loop = bb->loop_father;
12118 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12119 join_bb = e->dest;
12120 store_bb = create_empty_bb (bb);
12121 add_bb_to_loop (store_bb, bb_loop);
12122 e->flags = EDGE_TRUE_VALUE;
12123 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12124 /* Put STORE_BB on the likely path. */
12125 efalse->probability = profile_probability::likely ();
12126 e->probability = efalse->probability.invert ();
12127 store_bb->count = efalse->count ();
12128 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12129 if (dom_info_available_p (CDI_DOMINATORS))
12130 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12131 if (dump_enabled_p ())
12132 dump_printf_loc (MSG_NOTE, vect_location,
12133 "Create new block %d to sink mask stores.",
12134 store_bb->index);
12135 /* Create vector comparison with boolean result. */
12136 vectype = TREE_TYPE (mask);
12137 zero = build_zero_cst (vectype);
12138 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12139 gsi = gsi_last_bb (bb);
12140 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12141 /* Create new PHI node for vdef of the last masked store:
12142 .MEM_2 = VDEF <.MEM_1>
12143 will be converted to
12144 .MEM_3 = VDEF <.MEM_1>
12145 and new PHI node will be created in join bb
12146 .MEM_2 = PHI <.MEM_1, .MEM_3>
12148 vdef = gimple_vdef (last);
12149 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12150 gimple_set_vdef (last, new_vdef);
12151 phi = create_phi_node (vdef, join_bb);
12152 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12154 /* Put all masked stores with the same mask to STORE_BB if possible. */
12155 while (true)
12157 gimple_stmt_iterator gsi_from;
12158 gimple *stmt1 = NULL;
12160 /* Move masked store to STORE_BB. */
12161 last_store = last;
12162 gsi = gsi_for_stmt (last);
12163 gsi_from = gsi;
12164 /* Shift GSI to the previous stmt for further traversal. */
12165 gsi_prev (&gsi);
12166 gsi_to = gsi_start_bb (store_bb);
12167 gsi_move_before (&gsi_from, &gsi_to);
12168 /* Setup GSI_TO to the non-empty block start. */
12169 gsi_to = gsi_start_bb (store_bb);
12170 if (dump_enabled_p ())
12171 dump_printf_loc (MSG_NOTE, vect_location,
12172 "Move stmt to created bb\n%G", last);
12173 /* Move all stored value producers if possible. */
12174 while (!gsi_end_p (gsi))
12176 tree lhs;
12177 imm_use_iterator imm_iter;
12178 use_operand_p use_p;
12179 bool res;
12181 /* Skip debug statements. */
12182 if (is_gimple_debug (gsi_stmt (gsi)))
12184 gsi_prev (&gsi);
12185 continue;
12187 stmt1 = gsi_stmt (gsi);
12188 /* Do not consider statements writing to memory or having
12189 a volatile operand. */
12190 if (gimple_vdef (stmt1)
12191 || gimple_has_volatile_ops (stmt1))
12192 break;
12193 gsi_from = gsi;
12194 gsi_prev (&gsi);
12195 lhs = gimple_get_lhs (stmt1);
12196 if (!lhs)
12197 break;
12199 /* LHS of vectorized stmt must be SSA_NAME. */
12200 if (TREE_CODE (lhs) != SSA_NAME)
12201 break;
12203 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12205 /* Remove dead scalar statement. */
12206 if (has_zero_uses (lhs))
12208 gsi_remove (&gsi_from, true);
12209 continue;
12213 /* Check that LHS does not have uses outside of STORE_BB. */
12214 res = true;
12215 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12217 gimple *use_stmt;
12218 use_stmt = USE_STMT (use_p);
12219 if (is_gimple_debug (use_stmt))
12220 continue;
12221 if (gimple_bb (use_stmt) != store_bb)
12223 res = false;
12224 break;
12227 if (!res)
12228 break;
12230 if (gimple_vuse (stmt1)
12231 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12232 break;
12234 /* Can move STMT1 to STORE_BB. */
12235 if (dump_enabled_p ())
12236 dump_printf_loc (MSG_NOTE, vect_location,
12237 "Move stmt to created bb\n%G", stmt1);
12238 gsi_move_before (&gsi_from, &gsi_to);
12239 /* Shift GSI_TO for further insertion. */
12240 gsi_prev (&gsi_to);
12242 /* Put other masked stores with the same mask to STORE_BB. */
12243 if (worklist.is_empty ()
12244 || gimple_call_arg (worklist.last (), 2) != mask
12245 || worklist.last () != stmt1)
12246 break;
12247 last = worklist.pop ();
12249 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12253 /* Decide whether it is possible to use a zero-based induction variable
12254 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12255 the value that the induction variable must be able to hold in order
12256 to ensure that the rgroups eventually have no active vector elements.
12257 Return -1 otherwise. */
12259 widest_int
12260 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12262 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12263 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12264 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12266 /* Calculate the value that the induction variable must be able
12267 to hit in order to ensure that we end the loop with an all-false mask.
12268 This involves adding the maximum number of inactive trailing scalar
12269 iterations. */
12270 widest_int iv_limit = -1;
12271 if (max_loop_iterations (loop, &iv_limit))
12273 if (niters_skip)
12275 /* Add the maximum number of skipped iterations to the
12276 maximum iteration count. */
12277 if (TREE_CODE (niters_skip) == INTEGER_CST)
12278 iv_limit += wi::to_widest (niters_skip);
12279 else
12280 iv_limit += max_vf - 1;
12282 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12283 /* Make a conservatively-correct assumption. */
12284 iv_limit += max_vf - 1;
12286 /* IV_LIMIT is the maximum number of latch iterations, which is also
12287 the maximum in-range IV value. Round this value down to the previous
12288 vector alignment boundary and then add an extra full iteration. */
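/* For instance, if IV_LIMIT is 22 at this point and the loop has a
   constant VF of 8, the result is (22 & -8) + 8 == 24, the value the
   IV must be able to reach.  */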
12289 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12290 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12292 return iv_limit;
12295 /* For the given rgroup_controls RGC, check whether an induction variable
12296 would ever hit a value that produces a set of all-false masks or zero
12297 lengths before wrapping around. Return true if it's possible to wrap
12298 around before hitting the desired value, otherwise return false. */
12300 bool
12301 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12303 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12305 if (iv_limit == -1)
12306 return true;
12308 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12309 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12310 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
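/* For example, with a 16-bit COMPARE_TYPE, IV_LIMIT == 5000 and
   NITEMS == 16 the IV would have to count up to 80000, which needs
   17 bits, so it can wrap and we return true.  */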
12312 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12313 return true;
12315 return false;