gcc/tree-vect-loop.cc
/* Loop Vectorization
   Copyright (C) 2003-2023 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define INCLUDE_ALGORITHM
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "memmodel.h"
#include "optabs.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "cgraph.h"
#include "tree-cfg.h"
#include "tree-if-conv.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "tree-eh.h"
#include "case-cfn-macros.h"
#include "langhooks.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

	short a[N]; short b[N]; short c[N]; int i;

	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   as if it was manually vectorized by rewriting the source code into:

	typedef int __attribute__((mode(V8HI))) v8hi;
	short a[N]; short b[N]; short c[N]; int i;
	v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
	v8hi va, vb, vc;

	for (i=0; i<N/8; i++){
	  vb = pb[i];
	  vc = pc[i];
	  va = vb + vc;
	  pa[i] = va;
	}

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of following
   stmts which use the def of stmt S.  Stmt S is removed if it writes to memory;
   otherwise, we rely on dead code elimination for removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];      STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];      STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;         STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors, for now will need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/

static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
						unsigned *);
static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
					       bool *, bool *, bool);
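/* Illustrative sketch (added for clarity; not part of the original source):
   the kind of optab query described in the "Target modeling" section above.
   The helper name and the stand-alone framing are assumptions made purely
   for illustration.

     static bool
     example_target_supports_v8hi_add_p (void)
     {
       // CODE_FOR_nothing means the target has no instruction pattern for
       // an 8 x 16-bit integer vector addition, so such a stmt could not
       // be vectorized with V8HImode.
       return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
     }
*/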
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with a vector size (VS) of 16 bytes, the VF is set
   to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   vectorized loop:
	for (i=0; i<N; i+=VF){
	  a[i:VF] = b[i:VF] + c[i:VF];
	}
*/
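/* Illustrative note (added for clarity; not part of the original source):
   for the short-typed example in the file header comment the elements are
   2 bytes wide, so with 16-byte vectors the VF would be 16 / 2 = 8, i.e.
   eight shorts are processed per vector iteration, matching the i < N/8
   bound of the rewritten loop shown there.  */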
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
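/* Illustrative examples (added for clarity; not part of the original source),
   using the usual scalar-evolution notation {init, +, step}:

     for (i = 0; i < n; i++)
       s += 4;          // s has the simple evolution {s_0, +, 4}

     for (i = 0; i < n; i++)
       j = j * 2;       // not simple: no constant additive step, so this is
                        // left to vect_is_nonlinear_iv_evolution below.
*/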
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer type
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
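/* Illustrative examples of the nonlinear inductions handled above (added for
   clarity; not part of the original source).  X must have integer type and
   the multiplier or shift count must be a constant:

     for (i = 0; i < n; i++)  x = -x;       // vect_step_op_neg, fake step -1
     for (i = 0; i < n; i++)  x = x * 3;    // vect_step_op_mul, step 3
     for (i = 0; i < n; i++)  x = x << 1;   // vect_step_op_shl, step 1
     for (i = 0; i < n; i++)  x = x >> 2;   // vect_step_op_shr, step 2
*/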
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
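/* Illustrative source-level example of a double reduction (added for
   clarity; not part of the original source):

     int sum = 0;
     for (int i = 0; i < n; i++)     // outer loop: sum is the double
       for (int j = 0; j < m; j++)   // reduction; the inner-loop PHI for
         sum += a[i][j];             // sum is what this predicate detects.
*/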
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
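/* Illustrative source-level example of a first-order recurrence (added for
   clarity; not part of the original source); it mirrors the shape accepted
   by the checks above:

     int t = *c;
     for (int i = 0; i < n; i++)
       {
         b[i] = a[i] - t;   // uses the value carried over from the
         t = a[i];          // previous iteration; t is redefined every
       }                    // iteration but never accumulated.

   Once the two a[i] loads are combined, the latch value of T is defined by
   that load, which dominates the use of the PHI result, as required above.  */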
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  SLP indicates whether there will be some subsequent
   SLP analyses or not.  */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmt in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
825 it as regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions, this can simplify
911 computation of niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based off the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
945 unshare_expr (niter),
946 build_int_cst (TREE_TYPE (niter), 1));
947 *number_of_iterations = niter;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
953 return conds;
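/* Illustrative note (added for clarity; not part of the original source):
   for a simple counted loop

     for (i = 0; i < n; i++)
       ...

   with n known to be 5, the latch runs 4 times, so NUMBER_OF_ITERATIONSM1
   is 4 while NUMBER_OF_ITERATIONS, the number of header executions, is 5.  */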
956 /* Determine the main loop exit for the vectorizer. */
958 edge
959 vec_init_loop_exit_info (class loop *loop)
  /* Before we begin we must first determine which exit is the main one and
     which are auxiliary exits.  */
963 auto_vec<edge> exits = get_loop_exit_edges (loop);
964 if (exits.length () == 1)
965 return exits[0];
  /* If we have multiple exits we only support a counting IV at the moment.
     Analyze all exits and return one.  */
969 class tree_niter_desc niter_desc;
970 edge candidate = NULL;
971 for (edge exit : exits)
973 if (!get_loop_exit_condition (exit))
974 continue;
976 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
977 && !chrec_contains_undetermined (niter_desc.niter))
979 if (!niter_desc.may_be_zero || !candidate)
980 candidate = exit;
984 return candidate;
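/* Illustrative example (added for clarity; not part of the original source):
   in an early-break loop such as

     for (i = 0; i < n; i++)
       if (a[i] == key)
         break;

   there are two exits.  The exit guarded by the counting IV (i < n) has a
   computable niter expression and is the candidate preferred above, while
   the data-dependent break exit is one of the auxiliary exits.  */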
987 /* Function bb_in_loop_p
989 Used as predicate for dfs order traversal of the loop bbs. */
991 static bool
992 bb_in_loop_p (const_basic_block bb, const void *data)
994 const class loop *const loop = (const class loop *)data;
995 if (flow_bb_inside_loop_p (loop, bb))
996 return true;
997 return false;
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002 stmt_vec_info structs for all the stmts in LOOP_IN. */
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005 : vec_info (vec_info::loop, shared),
1006 loop (loop_in),
1007 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008 num_itersm1 (NULL_TREE),
1009 num_iters (NULL_TREE),
1010 num_iters_unchanged (NULL_TREE),
1011 num_iters_assumptions (NULL_TREE),
1012 vector_costs (nullptr),
1013 scalar_costs (nullptr),
1014 th (0),
1015 versioning_threshold (0),
1016 vectorization_factor (0),
1017 main_loop_edge (nullptr),
1018 skip_main_loop_edge (nullptr),
1019 skip_this_loop_edge (nullptr),
1020 reusable_accumulators (),
1021 suggested_unroll_factor (1),
1022 max_vectorization_factor (0),
1023 mask_skip_niters (NULL_TREE),
1024 rgroup_compare_type (NULL_TREE),
1025 simd_if_cond (NULL_TREE),
1026 partial_vector_style (vect_partial_vectors_none),
1027 unaligned_dr (NULL),
1028 peeling_for_alignment (0),
1029 ptr_mask (0),
1030 ivexpr_map (NULL),
1031 scan_map (NULL),
1032 slp_unrolling_factor (1),
1033 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034 vectorizable (false),
1035 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036 using_partial_vectors_p (false),
1037 using_decrementing_iv_p (false),
1038 using_select_vl_p (false),
1039 epil_using_partial_vectors_p (false),
1040 partial_load_store_bias (0),
1041 peeling_for_gaps (false),
1042 peeling_for_niter (false),
1043 no_data_dependencies (false),
1044 has_mask_store (false),
1045 scalar_loop_scaling (profile_probability::uninitialized ()),
1046 scalar_loop (NULL),
1047 orig_loop_info (NULL),
1048 vec_loop_iv_exit (NULL),
1049 vec_epilogue_loop_iv_exit (NULL),
1050 scalar_loop_iv_exit (NULL)
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the same
     as reversed postorder traversal, so we are safe.  */
1057 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1058 bbs, loop->num_nodes, loop);
1059 gcc_assert (nbbs == loop->num_nodes);
1061 for (unsigned int i = 0; i < nbbs; i++)
1063 basic_block bb = bbs[i];
1064 gimple_stmt_iterator si;
1066 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1068 gimple *phi = gsi_stmt (si);
1069 gimple_set_uid (phi, 0);
1070 add_stmt (phi);
1073 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1075 gimple *stmt = gsi_stmt (si);
1076 gimple_set_uid (stmt, 0);
1077 if (is_gimple_debug (stmt))
1078 continue;
1079 add_stmt (stmt);
1080 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1081 third argument is the #pragma omp simd if (x) condition, when 0,
1082 loop shouldn't be vectorized, when non-zero constant, it should
1083 be vectorized normally, otherwise versioned with vectorized loop
1084 done if the condition is non-zero at runtime. */
1085 if (loop_in->simduid
1086 && is_gimple_call (stmt)
1087 && gimple_call_internal_p (stmt)
1088 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1089 && gimple_call_num_args (stmt) >= 3
1090 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1091 && (loop_in->simduid
1092 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1094 tree arg = gimple_call_arg (stmt, 2);
1095 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1096 simd_if_cond = arg;
1097 else
1098 gcc_assert (integer_nonzerop (arg));
1103 epilogue_vinfos.create (6);
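/* Illustrative note on the IFN_GOMP_SIMD_LANE handling above (added for
   clarity; not part of the original source).  For a loop written as

     #pragma omp simd if (x)
     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   the third argument of the .GOMP_SIMD_LANE call carries X: a constant 0
   disables vectorization, a non-zero constant vectorizes the loop normally,
   and an SSA_NAME results in a loop versioned on the runtime value of X.  */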
1106 /* Free all levels of rgroup CONTROLS. */
1108 void
1109 release_vec_loop_controls (vec<rgroup_controls> *controls)
1111 rgroup_controls *rgc;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (*controls, i, rgc)
1114 rgc->controls.release ();
1115 controls->release ();
1118 /* Free all memory used by the _loop_vec_info, as well as all the
1119 stmt_vec_info structs of all the stmts in the loop. */
1121 _loop_vec_info::~_loop_vec_info ()
1123 free (bbs);
1125 release_vec_loop_controls (&masks.rgc_vec);
1126 release_vec_loop_controls (&lens);
1127 delete ivexpr_map;
1128 delete scan_map;
1129 epilogue_vinfos.release ();
1130 delete scalar_costs;
1131 delete vector_costs;
  /* When we release an epilogue vinfo that we do not intend to use,
     avoid clearing AUX of the main loop, which should continue to
     point to the main loop vinfo since otherwise we'll leak that.  */
1136 if (loop->aux == this)
1137 loop->aux = NULL;
1140 /* Return an invariant or register for EXPR and emit necessary
1141 computations in the LOOP_VINFO loop preheader. */
1143 tree
1144 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1146 if (is_gimple_reg (expr)
1147 || is_gimple_min_invariant (expr))
1148 return expr;
1150 if (! loop_vinfo->ivexpr_map)
1151 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1152 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1153 if (! cached)
1155 gimple_seq stmts = NULL;
1156 cached = force_gimple_operand (unshare_expr (expr),
1157 &stmts, true, NULL_TREE);
1158 if (stmts)
1160 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1161 gsi_insert_seq_on_edge_immediate (e, stmts);
1164 return cached;
1167 /* Return true if we can use CMP_TYPE as the comparison type to produce
1168 all masks required to mask LOOP_VINFO. */
1170 static bool
1171 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1173 rgroup_controls *rgm;
1174 unsigned int i;
1175 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1176 if (rgm->type != NULL_TREE
1177 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1178 cmp_type, rgm->type,
1179 OPTIMIZE_FOR_SPEED))
1180 return false;
1181 return true;
1184 /* Calculate the maximum number of scalars per iteration for every
1185 rgroup in LOOP_VINFO. */
1187 static unsigned int
1188 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1190 unsigned int res = 1;
1191 unsigned int i;
1192 rgroup_controls *rgm;
1193 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194 res = MAX (res, rgm->max_nscalars_per_iter);
1195 return res;
1198 /* Calculate the minimum precision necessary to represent:
1200 MAX_NITERS * FACTOR
1202 as an unsigned integer, where MAX_NITERS is the maximum number of
1203 loop header iterations for the original scalar form of LOOP_VINFO. */
1205 static unsigned
1206 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1208 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1210 /* Get the maximum number of iterations that is representable
1211 in the counter type. */
1212 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1213 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1215 /* Get a more refined estimate for the number of iterations. */
1216 widest_int max_back_edges;
1217 if (max_loop_iterations (loop, &max_back_edges))
1218 max_ni = wi::smin (max_ni, max_back_edges + 1);
1220 /* Work out how many bits we need to represent the limit. */
1221 return wi::min_precision (max_ni * factor, UNSIGNED);
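/* Illustrative arithmetic (added for clarity; not part of the original
   source): if the refined bound on the number of header iterations is 1000
   and FACTOR is 4, the product 4000 fits in 12 bits but not in 11, so the
   function returns 12.  */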
1224 /* True if the loop needs peeling or partial vectors when vectorized. */
1226 static bool
1227 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1229 unsigned HOST_WIDE_INT const_vf;
1230 HOST_WIDE_INT max_niter
1231 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1233 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1234 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1235 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1236 (loop_vinfo));
1238 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1239 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1241 /* Work out the (constant) number of iterations that need to be
1242 peeled for reasons other than niters. */
1243 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1244 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1245 peel_niter += 1;
1246 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1247 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1248 return true;
1250 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1251 /* ??? When peeling for gaps but not alignment, we could
1252 try to check whether the (variable) niters is known to be
1253 VF * N + 1. That's something of a niche case though. */
1254 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1255 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1256 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1257 < (unsigned) exact_log2 (const_vf))
1258 /* In case of versioning, check if the maximum number of
1259 iterations is greater than th. If they are identical,
1260 the epilogue is unnecessary. */
1261 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1262 || ((unsigned HOST_WIDE_INT) max_niter
1263 > (th / const_vf) * const_vf))))
1264 return true;
1266 return false;
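/* Illustrative example (added for clarity; not part of the original source):
   with a known iteration count of 100, VF = 8, 3 iterations peeled for
   alignment and 1 more peeled for gaps, the test above asks whether
   100 - 4 = 96 is a multiple of 8; it is, so no epilogue peeling or partial
   vectors are needed on that account.  */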
1269 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1270 whether we can actually generate the masks required. Return true if so,
1271 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1273 static bool
1274 vect_verify_full_masking (loop_vec_info loop_vinfo)
1276 unsigned int min_ni_width;
1278 /* Use a normal loop if there are no statements that need masking.
1279 This only happens in rare degenerate cases: it means that the loop
1280 has no loads, no stores, and no live-out values. */
1281 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1282 return false;
1284 /* Produce the rgroup controls. */
1285 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1287 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1288 tree vectype = mask.first;
1289 unsigned nvectors = mask.second;
1291 if (masks->rgc_vec.length () < nvectors)
1292 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1293 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1294 /* The number of scalars per iteration and the number of vectors are
1295 both compile-time constants. */
1296 unsigned int nscalars_per_iter
1297 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1298 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1300 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1302 rgm->max_nscalars_per_iter = nscalars_per_iter;
1303 rgm->type = truth_type_for (vectype);
1304 rgm->factor = 1;
1308 unsigned int max_nscalars_per_iter
1309 = vect_get_max_nscalars_per_iter (loop_vinfo);
1311 /* Work out how many bits we need to represent the limit. */
1312 min_ni_width
1313 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1315 /* Find a scalar mode for which WHILE_ULT is supported. */
1316 opt_scalar_int_mode cmp_mode_iter;
1317 tree cmp_type = NULL_TREE;
1318 tree iv_type = NULL_TREE;
1319 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1320 unsigned int iv_precision = UINT_MAX;
1322 if (iv_limit != -1)
1323 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1324 UNSIGNED);
1326 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1328 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1329 if (cmp_bits >= min_ni_width
1330 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1332 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1333 if (this_type
1334 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1336 /* Although we could stop as soon as we find a valid mode,
1337 there are at least two reasons why that's not always the
1338 best choice:
1340 - An IV that's Pmode or wider is more likely to be reusable
1341 in address calculations than an IV that's narrower than
1342 Pmode.
1344 - Doing the comparison in IV_PRECISION or wider allows
1345 a natural 0-based IV, whereas using a narrower comparison
1346 type requires mitigations against wrap-around.
1348 Conversely, if the IV limit is variable, doing the comparison
1349 in a wider type than the original type can introduce
1350 unnecessary extensions, so picking the widest valid mode
1351 is not always a good choice either.
1353 Here we prefer the first IV type that's Pmode or wider,
1354 and the first comparison type that's IV_PRECISION or wider.
1355 (The comparison type must be no wider than the IV type,
1356 to avoid extensions in the vector loop.)
1358 ??? We might want to try continuing beyond Pmode for ILP32
1359 targets if CMP_BITS < IV_PRECISION. */
1360 iv_type = this_type;
1361 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1362 cmp_type = this_type;
1363 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1364 break;
1369 if (!cmp_type)
1371 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1372 return false;
1375 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1376 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1377 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1378 return true;
1381 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1382 whether we can actually generate AVX512 style masks. Return true if so,
1383 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1385 static bool
1386 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1388 /* Produce differently organized rgc_vec and differently check
1389 we can produce masks. */
1391 /* Use a normal loop if there are no statements that need masking.
1392 This only happens in rare degenerate cases: it means that the loop
1393 has no loads, no stores, and no live-out values. */
1394 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1395 return false;
1397 /* For the decrementing IV we need to represent all values in
1398 [0, niter + niter_skip] where niter_skip is the elements we
1399 skip in the first iteration for prologue peeling. */
1400 tree iv_type = NULL_TREE;
1401 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1402 unsigned int iv_precision = UINT_MAX;
1403 if (iv_limit != -1)
1404 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1406 /* First compute the type for the IV we use to track the remaining
1407 scalar iterations. */
1408 opt_scalar_int_mode cmp_mode_iter;
1409 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1411 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1412 if (cmp_bits >= iv_precision
1413 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1415 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1416 if (iv_type)
1417 break;
1420 if (!iv_type)
1421 return false;
1423 /* Produce the rgroup controls. */
1424 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1426 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1427 tree vectype = mask.first;
1428 unsigned nvectors = mask.second;
1430 /* The number of scalars per iteration and the number of vectors are
1431 both compile-time constants. */
1432 unsigned int nscalars_per_iter
1433 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1434 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1436 /* We index the rgroup_controls vector with nscalars_per_iter
1437 which we keep constant and instead have a varying nvectors,
1438 remembering the vector mask with the fewest nV. */
1439 if (masks->rgc_vec.length () < nscalars_per_iter)
1440 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1441 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1443 if (!rgm->type || rgm->factor > nvectors)
1445 rgm->type = truth_type_for (vectype);
1446 rgm->compare_type = NULL_TREE;
1447 rgm->max_nscalars_per_iter = nscalars_per_iter;
1448 rgm->factor = nvectors;
1449 rgm->bias_adjusted_ctrl = NULL_TREE;
1453 /* There is no fixed compare type we are going to use but we have to
1454 be able to get at one for each mask group. */
1455 unsigned int min_ni_width
1456 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1458 bool ok = true;
1459 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1461 tree mask_type = rgc.type;
1462 if (!mask_type)
1463 continue;
1465 /* For now vect_get_loop_mask only supports integer mode masks
1466 when we need to split it. */
1467 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1468 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1470 ok = false;
1471 break;
1474 /* If iv_type is usable as compare type use that - we can elide the
1475 saturation in that case. */
1476 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1478 tree cmp_vectype
1479 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1480 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1481 rgc.compare_type = cmp_vectype;
1483 if (!rgc.compare_type)
1484 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1486 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1487 if (cmp_bits >= min_ni_width
1488 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1490 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1491 if (!cmp_type)
1492 continue;
1494 /* Check whether we can produce the mask with cmp_type. */
1495 tree cmp_vectype
1496 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1497 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1499 rgc.compare_type = cmp_vectype;
1500 break;
1504 if (!rgc.compare_type)
1506 ok = false;
1507 break;
1510 if (!ok)
1512 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1513 return false;
1516 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1517 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1518 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1519 return true;
/* Check whether we can use vector accesses with length, based on a precision
   comparison.  So far, to keep it simple, we only allow the case in which the
   precision of the target-supported length is larger than the precision
   required by the loop niters.  */
1527 static bool
1528 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1530 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1531 return false;
1533 machine_mode len_load_mode, len_store_mode;
1534 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1535 .exists (&len_load_mode))
1536 return false;
1537 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1538 .exists (&len_store_mode))
1539 return false;
1541 signed char partial_load_bias = internal_len_load_store_bias
1542 (IFN_LEN_LOAD, len_load_mode);
1544 signed char partial_store_bias = internal_len_load_store_bias
1545 (IFN_LEN_STORE, len_store_mode);
1547 gcc_assert (partial_load_bias == partial_store_bias);
1549 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1550 return false;
1552 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1553 len_loads with a length of zero. In order to avoid that we prohibit
1554 more than one loop length here. */
1555 if (partial_load_bias == -1
1556 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1557 return false;
1559 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1561 unsigned int max_nitems_per_iter = 1;
1562 unsigned int i;
1563 rgroup_controls *rgl;
1564 /* Find the maximum number of items per iteration for every rgroup. */
1565 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1567 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1568 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1571 /* Work out how many bits we need to represent the length limit. */
1572 unsigned int min_ni_prec
1573 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1575 /* Now use the maximum of below precisions for one suitable IV type:
1576 - the IV's natural precision
1577 - the precision needed to hold: the maximum number of scalar
1578 iterations multiplied by the scale factor (min_ni_prec above)
1579 - the Pmode precision
     If min_ni_prec is less than the precision of the current niters,
     we prefer to still use the niters type.  Prefer to use Pmode and
     a wider IV to avoid narrow conversions.  */
1585 unsigned int ni_prec
1586 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1587 min_ni_prec = MAX (min_ni_prec, ni_prec);
1588 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1590 tree iv_type = NULL_TREE;
1591 opt_scalar_int_mode tmode_iter;
1592 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1594 scalar_mode tmode = tmode_iter.require ();
1595 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1597 /* ??? Do we really want to construct one IV whose precision exceeds
1598 BITS_PER_WORD? */
1599 if (tbits > BITS_PER_WORD)
1600 break;
1602 /* Find the first available standard integral type. */
1603 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1605 iv_type = build_nonstandard_integer_type (tbits, true);
1606 break;
1610 if (!iv_type)
1612 if (dump_enabled_p ())
1613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1614 "can't vectorize with length-based partial vectors"
1615 " because there is no suitable iv type.\n");
1616 return false;
1619 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1620 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1621 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1623 return true;
1626 /* Calculate the cost of one scalar iteration of the loop. */
1627 static void
1628 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1630 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1632 int nbbs = loop->num_nodes, factor;
1633 int innerloop_iters, i;
1635 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1637 /* Gather costs for statements in the scalar loop. */
1639 /* FORNOW. */
1640 innerloop_iters = 1;
1641 if (loop->inner)
1642 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1644 for (i = 0; i < nbbs; i++)
1646 gimple_stmt_iterator si;
1647 basic_block bb = bbs[i];
1649 if (bb->loop_father == loop->inner)
1650 factor = innerloop_iters;
1651 else
1652 factor = 1;
1654 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1656 gimple *stmt = gsi_stmt (si);
1657 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1659 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1660 continue;
1662 /* Skip stmts that are not vectorized inside the loop. */
1663 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1664 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1665 && (!STMT_VINFO_LIVE_P (vstmt_info)
1666 || !VECTORIZABLE_CYCLE_DEF
1667 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1668 continue;
1670 vect_cost_for_stmt kind;
1671 if (STMT_VINFO_DATA_REF (stmt_info))
1673 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1674 kind = scalar_load;
1675 else
1676 kind = scalar_store;
1678 else if (vect_nop_conversion_p (stmt_info))
1679 continue;
1680 else
1681 kind = scalar_stmt;
1683 /* We are using vect_prologue here to avoid scaling twice
1684 by the inner loop factor. */
1685 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1686 factor, kind, stmt_info, 0, vect_prologue);
1690 /* Now accumulate cost. */
1691 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1692 add_stmt_costs (loop_vinfo->scalar_costs,
1693 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1694 loop_vinfo->scalar_costs->finish_cost (nullptr);
1698 /* Function vect_analyze_loop_form.
1700 Verify that certain CFG restrictions hold, including:
1701 - the loop has a pre-header
1702 - the loop has a single entry and exit
1703 - the loop exit condition is simple enough
1704 - the number of iterations can be analyzed, i.e, a countable loop. The
1705 niter could be analyzed under some assumptions. */
1707 opt_result
1708 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1710 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1712 edge exit_e = vec_init_loop_exit_info (loop);
1713 if (!exit_e)
1714 return opt_result::failure_at (vect_location,
1715 "not vectorized:"
1716 " could not determine main exit from"
1717 " loop with multiple exits.\n");
1718 info->loop_exit = exit_e;
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_NOTE, vect_location,
1721 "using as main loop exit: %d -> %d [AUX: %p]\n",
1722 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1724 /* Different restrictions apply when we are considering an inner-most loop,
1725 vs. an outer (nested) loop.
1726 (FORNOW. May want to relax some of these restrictions in the future). */
1728 info->inner_loop_cond = NULL;
1729 if (!loop->inner)
1731 /* Inner-most loop. We currently require that the number of BBs is
1732 exactly 2 (the header and latch). Vectorizable inner-most loops
1733 look like this:
1735 (pre-header)
1737 header <--------+
1738 | | |
1739 | +--> latch --+
1741 (exit-bb) */
1743 if (loop->num_nodes != 2)
1744 return opt_result::failure_at (vect_location,
1745 "not vectorized:"
1746 " control flow in loop.\n");
1748 if (empty_block_p (loop->header))
1749 return opt_result::failure_at (vect_location,
1750 "not vectorized: empty loop.\n");
1752 else
1754 class loop *innerloop = loop->inner;
1755 edge entryedge;
1757 /* Nested loop. We currently require that the loop is doubly-nested,
1758 contains a single inner loop, and the number of BBs is exactly 5.
1759 Vectorizable outer-loops look like this:
1761 (pre-header)
1763 header <---+
1765 inner-loop |
1767 tail ------+
1769 (exit-bb)
1771 The inner-loop has the properties expected of inner-most loops
1772 as described above. */
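      /* For illustration only (example loop invented for this sketch), a
	 source-level loop nest of the supported shape might look like:

	   for (i = 0; i < N; i++)      <-- outer loop being analyzed here
	     {
	       s = 0;
	       for (j = 0; j < M; j++)  <-- single inner-most loop
		 s += a[i][j];
	       b[i] = s;
	     }
       */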
1774 if ((loop->inner)->inner || (loop->inner)->next)
1775 return opt_result::failure_at (vect_location,
1776 "not vectorized:"
1777 " multiple nested loops.\n");
1779 if (loop->num_nodes != 5)
1780 return opt_result::failure_at (vect_location,
1781 "not vectorized:"
1782 " control flow in loop.\n");
1784 entryedge = loop_preheader_edge (innerloop);
1785 if (entryedge->src != loop->header
1786 || !single_exit (innerloop)
1787 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1788 return opt_result::failure_at (vect_location,
1789 "not vectorized:"
1790 " unsupported outerloop form.\n");
1792 /* Analyze the inner-loop. */
1793 vect_loop_form_info inner;
1794 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1795 if (!res)
1797 if (dump_enabled_p ())
1798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1799 "not vectorized: Bad inner loop.\n");
1800 return res;
1803 /* Don't support analyzing niter under assumptions for inner
1804 loop. */
1805 if (!integer_onep (inner.assumptions))
1806 return opt_result::failure_at (vect_location,
1807 "not vectorized: Bad inner loop.\n");
1809 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1810 return opt_result::failure_at (vect_location,
1811 "not vectorized: inner-loop count not"
1812 " invariant.\n");
1814 if (dump_enabled_p ())
1815 dump_printf_loc (MSG_NOTE, vect_location,
1816 "Considering outer-loop vectorization.\n");
1817 info->inner_loop_cond = inner.conds[0];
1820 if (!single_exit (loop))
1821 return opt_result::failure_at (vect_location,
1822 "not vectorized: multiple exits.\n");
1823 if (EDGE_COUNT (loop->header->preds) != 2)
1824 return opt_result::failure_at (vect_location,
1825 "not vectorized:"
1826 " too many incoming edges.\n");
1828 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1829 that the loop is represented as a do-while (with a proper if-guard
1830 before the loop if needed), where the loop header contains all the
1831 executable statements, and the latch is empty. */
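  /* Illustrative sketch only (not taken from the original sources): the
     expected do-while shape is roughly

       header:
	 ...all executable statements...
	 if (i < n) goto latch; else goto exit_bb;
       latch:                       <-- empty, no PHIs
	 goto header;
   */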
1832 if (!empty_block_p (loop->latch)
1833 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1834 return opt_result::failure_at (vect_location,
1835 "not vectorized: latch block not empty.\n");
1837 /* Make sure the exit is not abnormal. */
1838 if (exit_e->flags & EDGE_ABNORMAL)
1839 return opt_result::failure_at (vect_location,
1840 "not vectorized:"
1841 " abnormal loop exit edge.\n");
1843 info->conds
1844 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1845 &info->number_of_iterations,
1846 &info->number_of_iterationsm1);
1848 if (info->conds.is_empty ())
1849 return opt_result::failure_at
1850 (vect_location,
1851 "not vectorized: complicated exit condition.\n");
1853 /* Determine what the primary and alternate exit conds are. */
1854 for (unsigned i = 0; i < info->conds.length (); i++)
1856 gcond *cond = info->conds[i];
1857 if (exit_e->src == gimple_bb (cond))
1858 std::swap (info->conds[0], info->conds[i]);
1861 if (integer_zerop (info->assumptions)
1862 || !info->number_of_iterations
1863 || chrec_contains_undetermined (info->number_of_iterations))
1864 return opt_result::failure_at
1865 (info->conds[0],
1866 "not vectorized: number of iterations cannot be computed.\n");
1868 if (integer_zerop (info->number_of_iterations))
1869 return opt_result::failure_at
1870 (info->conds[0],
1871 "not vectorized: number of iterations = 0.\n");
1873 if (!(tree_fits_shwi_p (info->number_of_iterations)
1874 && tree_to_shwi (info->number_of_iterations) > 0))
1876 if (dump_enabled_p ())
1878 dump_printf_loc (MSG_NOTE, vect_location,
1879 "Symbolic number of iterations is ");
1880 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1881 dump_printf (MSG_NOTE, "\n");
1885 return opt_result::success ();
1888 /* Create a loop_vec_info for LOOP with SHARED and the
1889 vect_analyze_loop_form result. */
1891 loop_vec_info
1892 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1893 const vect_loop_form_info *info,
1894 loop_vec_info main_loop_info)
1896 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1897 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1898 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1899 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1900 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1901 /* Also record the assumptions for versioning. */
1902 if (!integer_onep (info->assumptions) && !main_loop_info)
1903 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1905 for (gcond *cond : info->conds)
1907 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1908 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1911 for (unsigned i = 1; i < info->conds.length (); i ++)
1912 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1913 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1915 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1917 if (info->inner_loop_cond)
1919 stmt_vec_info inner_loop_cond_info
1920 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1921 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1922 /* If we have an estimate on the number of iterations of the inner
1923 loop, use that to limit the scale for costing; otherwise use
1924 --param vect-inner-loop-cost-factor literally. */
1925 widest_int nit;
1926 if (estimated_stmt_executions (loop->inner, &nit))
1927 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1928 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
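      /* Illustrative numbers only: with an estimated four executions of the
	 inner loop per outer iteration and --param vect-inner-loop-cost-factor
	 at its usual default of 50, the factor used here is MIN (4, 50) == 4.  */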
1931 return loop_vinfo;
1936 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1937 statements update the vectorization factor. */
1939 static void
1940 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1942 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1943 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1944 int nbbs = loop->num_nodes;
1945 poly_uint64 vectorization_factor;
1946 int i;
1948 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1950 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1951 gcc_assert (known_ne (vectorization_factor, 0U));
1953 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1954 the vectorization factor of the loop is the unrolling factor required by
1955 the SLP instances. If that unrolling factor is 1, we say that we
1956 perform pure SLP on the loop: cross-iteration parallelism is not
1957 exploited. */
1958 bool only_slp_in_loop = true;
1959 for (i = 0; i < nbbs; i++)
1961 basic_block bb = bbs[i];
1962 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1963 gsi_next (&si))
1965 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1966 if (!stmt_info)
1967 continue;
1968 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1969 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1970 && !PURE_SLP_STMT (stmt_info))
1971 /* STMT needs both SLP and loop-based vectorization. */
1972 only_slp_in_loop = false;
1974 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1975 gsi_next (&si))
1977 if (is_gimple_debug (gsi_stmt (si)))
1978 continue;
1979 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1980 stmt_info = vect_stmt_to_vectorize (stmt_info);
1981 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1982 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1983 && !PURE_SLP_STMT (stmt_info))
1984 /* STMT needs both SLP and loop-based vectorization. */
1985 only_slp_in_loop = false;
1989 if (only_slp_in_loop)
1991 if (dump_enabled_p ())
1992 dump_printf_loc (MSG_NOTE, vect_location,
1993 "Loop contains only SLP stmts\n");
1994 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1996 else
1998 if (dump_enabled_p ())
1999 dump_printf_loc (MSG_NOTE, vect_location,
2000 "Loop contains SLP and non-SLP stmts\n");
2001 /* Both the vectorization factor and unroll factor have the form
2002 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2003 so they must have a common multiple. */
2004 vectorization_factor
2005 = force_common_multiple (vectorization_factor,
2006 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
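      /* Illustrative numbers only: a loop vectorization factor of 4 combined
	 with an SLP unrolling factor of 6 yields their least common
	 multiple, 12.  */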
2009 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2010 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_NOTE, vect_location,
2013 "Updating vectorization factor to ");
2014 dump_dec (MSG_NOTE, vectorization_factor);
2015 dump_printf (MSG_NOTE, ".\n");
2019 /* Return true if STMT_INFO describes a double reduction phi and if
2020 the other phi in the reduction is also relevant for vectorization.
2021 This rejects cases such as:
2023 outer1:
2024 x_1 = PHI <x_3(outer2), ...>;
2027 inner:
2028 x_2 = ...;
2031 outer2:
2032 x_3 = PHI <x_2(inner)>;
2034 if nothing in x_2 or elsewhere makes x_1 relevant. */
2036 static bool
2037 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2039 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2040 return false;
2042 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2045 /* Function vect_analyze_loop_operations.
2047 Scan the loop stmts and make sure they are all vectorizable. */
2049 static opt_result
2050 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2052 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2053 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2054 int nbbs = loop->num_nodes;
2055 int i;
2056 stmt_vec_info stmt_info;
2057 bool need_to_vectorize = false;
2058 bool ok;
2060 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2062 auto_vec<stmt_info_for_cost> cost_vec;
2064 for (i = 0; i < nbbs; i++)
2066 basic_block bb = bbs[i];
2068 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2069 gsi_next (&si))
2071 gphi *phi = si.phi ();
2072 ok = true;
2074 stmt_info = loop_vinfo->lookup_stmt (phi);
2075 if (dump_enabled_p ())
2076 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2077 (gimple *) phi);
2078 if (virtual_operand_p (gimple_phi_result (phi)))
2079 continue;
2081 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2082 (i.e., a phi in the tail of the outer-loop). */
2083 if (! is_loop_header_bb_p (bb))
2085 /* FORNOW: we currently don't support the case where these phis
2086 are not used in the outer loop (unless it is a double reduction,
2087 i.e., this phi is vect_reduction_def), because that case
2088 requires us to actually do something here. */
2089 if (STMT_VINFO_LIVE_P (stmt_info)
2090 && !vect_active_double_reduction_p (stmt_info))
2091 return opt_result::failure_at (phi,
2092 "Unsupported loop-closed phi"
2093 " in outer-loop.\n");
2095 /* If PHI is used in the outer loop, we check that its operand
2096 is defined in the inner loop. */
2097 if (STMT_VINFO_RELEVANT_P (stmt_info))
2099 tree phi_op;
2101 if (gimple_phi_num_args (phi) != 1)
2102 return opt_result::failure_at (phi, "unsupported phi");
2104 phi_op = PHI_ARG_DEF (phi, 0);
2105 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2106 if (!op_def_info)
2107 return opt_result::failure_at (phi, "unsupported phi\n");
2109 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2110 && (STMT_VINFO_RELEVANT (op_def_info)
2111 != vect_used_in_outer_by_reduction))
2112 return opt_result::failure_at (phi, "unsupported phi\n");
2114 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2115 || (STMT_VINFO_DEF_TYPE (stmt_info)
2116 == vect_double_reduction_def))
2117 && !vectorizable_lc_phi (loop_vinfo,
2118 stmt_info, NULL, NULL))
2119 return opt_result::failure_at (phi, "unsupported phi\n");
2122 continue;
2125 gcc_assert (stmt_info);
2127 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2128 || STMT_VINFO_LIVE_P (stmt_info))
2129 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2130 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2131 /* A scalar-dependence cycle that we don't support. */
2132 return opt_result::failure_at (phi,
2133 "not vectorized:"
2134 " scalar dependence cycle.\n");
2136 if (STMT_VINFO_RELEVANT_P (stmt_info))
2138 need_to_vectorize = true;
2139 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2140 && ! PURE_SLP_STMT (stmt_info))
2141 ok = vectorizable_induction (loop_vinfo,
2142 stmt_info, NULL, NULL,
2143 &cost_vec);
2144 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2145 || (STMT_VINFO_DEF_TYPE (stmt_info)
2146 == vect_double_reduction_def)
2147 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2148 && ! PURE_SLP_STMT (stmt_info))
2149 ok = vectorizable_reduction (loop_vinfo,
2150 stmt_info, NULL, NULL, &cost_vec);
2151 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2152 == vect_first_order_recurrence)
2153 && ! PURE_SLP_STMT (stmt_info))
2154 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2155 &cost_vec);
2158 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2159 if (ok
2160 && STMT_VINFO_LIVE_P (stmt_info)
2161 && !PURE_SLP_STMT (stmt_info))
2162 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2163 -1, false, &cost_vec);
2165 if (!ok)
2166 return opt_result::failure_at (phi,
2167 "not vectorized: relevant phi not "
2168 "supported: %G",
2169 static_cast <gimple *> (phi));
2172 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2173 gsi_next (&si))
2175 gimple *stmt = gsi_stmt (si);
2176 if (!gimple_clobber_p (stmt)
2177 && !is_gimple_debug (stmt))
2179 opt_result res
2180 = vect_analyze_stmt (loop_vinfo,
2181 loop_vinfo->lookup_stmt (stmt),
2182 &need_to_vectorize,
2183 NULL, NULL, &cost_vec);
2184 if (!res)
2185 return res;
2188 } /* bbs */
2190 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2192 /* All operations in the loop are either irrelevant (they deal with loop
2193 control, or are dead), or only used outside the loop and can be moved
2194 out of the loop (e.g. invariants, inductions). The loop can be
2195 optimized away by scalar optimizations. We're better off not
2196 touching this loop. */
2197 if (!need_to_vectorize)
2199 if (dump_enabled_p ())
2200 dump_printf_loc (MSG_NOTE, vect_location,
2201 "All the computation can be taken out of the loop.\n");
2202 return opt_result::failure_at
2203 (vect_location,
2204 "not vectorized: redundant loop. no profit to vectorize.\n");
2207 return opt_result::success ();
2210 /* Return true if we know that the iteration count is smaller than the
2211 vectorization factor. Return false if it isn't, or if we can't be sure
2212 either way. */
2214 static bool
2215 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2217 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2219 HOST_WIDE_INT max_niter;
2220 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2221 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2222 else
2223 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2225 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2226 return true;
2228 return false;
2231 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2232 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2233 definitely no, or -1 if it's worth retrying. */
2235 static int
2236 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2237 unsigned *suggested_unroll_factor)
2239 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2240 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2242 /* Only loops that can handle partially-populated vectors can have iteration
2243 counts less than the vectorization factor. */
2244 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2245 && vect_known_niters_smaller_than_vf (loop_vinfo))
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "not vectorized: iteration count smaller than "
2250 "vectorization factor.\n");
2251 return 0;
2254 /* If we know the number of iterations we can do better: for the
2255 epilogue we can also decide whether the main loop leaves us
2256 with enough iterations, preferring a smaller vector epilogue that
2257 is then also possibly used for the case where we skip the vector loop. */
2258 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2260 widest_int scalar_niters
2261 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2262 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2264 loop_vec_info orig_loop_vinfo
2265 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2266 unsigned lowest_vf
2267 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2268 int prolog_peeling = 0;
2269 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2270 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2271 if (prolog_peeling >= 0
2272 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2273 lowest_vf))
2275 unsigned gap
2276 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2277 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2278 % lowest_vf + gap);
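	      /* Illustrative numbers only: with 11 scalar iterations, a main
		 loop VF of 4, prolog peeling of 1 and no gap, the epilogue
		 above is left with (11 - 0 - 1) % 4 + 0 == 2 scalar
		 iterations.  */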
2281 /* Reject vectorizing for a single scalar iteration, even if
2282 we could in principle implement that using partial vectors. */
2283 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2284 if (scalar_niters <= peeling_gap + 1)
2286 if (dump_enabled_p ())
2287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2288 "not vectorized: loop only has a single "
2289 "scalar iteration.\n");
2290 return 0;
2293 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2295 /* Check that the loop processes at least one full vector. */
2296 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2297 if (known_lt (scalar_niters, vf))
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301 "loop does not have enough iterations "
2302 "to support vectorization.\n");
2303 return 0;
2306 /* If we need to peel an extra epilogue iteration to handle data
2307 accesses with gaps, check that there are enough scalar iterations
2308 available.
2310 The check above is redundant with this one when peeling for gaps,
2311 but the distinction is useful for diagnostics. */
2312 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2313 && known_le (scalar_niters, vf))
2315 if (dump_enabled_p ())
2316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317 "loop does not have enough iterations "
2318 "to support peeling for gaps.\n");
2319 return 0;
2324 /* If using the "very cheap" model, reject cases in which we'd keep
2325 a copy of the scalar code (even if we might be able to vectorize it). */
2326 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2327 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2328 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2329 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "some scalar iterations would need to be peeled\n");
2334 return 0;
2337 int min_profitable_iters, min_profitable_estimate;
2338 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2339 &min_profitable_estimate,
2340 suggested_unroll_factor);
2342 if (min_profitable_iters < 0)
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "not vectorized: vectorization not profitable.\n");
2347 if (dump_enabled_p ())
2348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2349 "not vectorized: vector version will never be "
2350 "profitable.\n");
2351 return -1;
2354 int min_scalar_loop_bound = (param_min_vect_loop_bound
2355 * assumed_vf);
2357 /* Use the cost model only if it is more conservative than user specified
2358 threshold. */
2359 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2360 min_profitable_iters);
2362 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2364 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2365 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369 "not vectorized: vectorization not profitable.\n");
2370 if (dump_enabled_p ())
2371 dump_printf_loc (MSG_NOTE, vect_location,
2372 "not vectorized: iteration count smaller than user "
2373 "specified loop bound parameter or minimum profitable "
2374 "iterations (whichever is more conservative).\n");
2375 return 0;
2378 /* The static profitability threshold min_profitable_estimate includes
2379 the cost of having to check at runtime whether the scalar loop
2380 should be used instead. If it turns out that we don't need or want
2381 such a check, the threshold we should use for the static estimate
2382 is simply the point at which the vector loop becomes more profitable
2383 than the scalar loop. */
2384 if (min_profitable_estimate > min_profitable_iters
2385 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2386 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2387 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2388 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2390 if (dump_enabled_p ())
2391 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2392 " choice between the scalar and vector loops\n");
2393 min_profitable_estimate = min_profitable_iters;
2396 /* If the vector loop needs multiple iterations to be beneficial then
2397 things are probably too close to call, and the conservative thing
2398 would be to stick with the scalar code. */
2399 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2400 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2402 if (dump_enabled_p ())
2403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2404 "one iteration of the vector loop would be"
2405 " more expensive than the equivalent number of"
2406 " iterations of the scalar loop\n");
2407 return 0;
2410 HOST_WIDE_INT estimated_niter;
2412 /* If we are vectorizing an epilogue then we know the maximum number of
2413 scalar iterations it will cover is at least one lower than the
2414 vectorization factor of the main loop. */
2415 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2416 estimated_niter
2417 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2418 else
2420 estimated_niter = estimated_stmt_executions_int (loop);
2421 if (estimated_niter == -1)
2422 estimated_niter = likely_max_stmt_executions_int (loop);
2424 if (estimated_niter != -1
2425 && ((unsigned HOST_WIDE_INT) estimated_niter
2426 < MAX (th, (unsigned) min_profitable_estimate)))
2428 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2430 "not vectorized: estimated iteration count too "
2431 "small.\n");
2432 if (dump_enabled_p ())
2433 dump_printf_loc (MSG_NOTE, vect_location,
2434 "not vectorized: estimated iteration count smaller "
2435 "than specified loop bound parameter or minimum "
2436 "profitable iterations (whichever is more "
2437 "conservative).\n");
2438 return -1;
2441 return 1;
2444 static opt_result
2445 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2446 vec<data_reference_p> *datarefs,
2447 unsigned int *n_stmts)
2449 *n_stmts = 0;
2450 for (unsigned i = 0; i < loop->num_nodes; i++)
2451 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2452 !gsi_end_p (gsi); gsi_next (&gsi))
2454 gimple *stmt = gsi_stmt (gsi);
2455 if (is_gimple_debug (stmt))
2456 continue;
2457 ++(*n_stmts);
2458 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2459 NULL, 0);
2460 if (!res)
2462 if (is_gimple_call (stmt) && loop->safelen)
2464 tree fndecl = gimple_call_fndecl (stmt), op;
2465 if (fndecl == NULL_TREE
2466 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2468 fndecl = gimple_call_arg (stmt, 0);
2469 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2470 fndecl = TREE_OPERAND (fndecl, 0);
2471 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2473 if (fndecl != NULL_TREE)
2475 cgraph_node *node = cgraph_node::get (fndecl);
2476 if (node != NULL && node->simd_clones != NULL)
2478 unsigned int j, n = gimple_call_num_args (stmt);
2479 for (j = 0; j < n; j++)
2481 op = gimple_call_arg (stmt, j);
2482 if (DECL_P (op)
2483 || (REFERENCE_CLASS_P (op)
2484 && get_base_address (op)))
2485 break;
2487 op = gimple_call_lhs (stmt);
2488 /* Ignore #pragma omp declare simd functions
2489 if they don't have data references in the
2490 call stmt itself. */
2491 if (j == n
2492 && !(op
2493 && (DECL_P (op)
2494 || (REFERENCE_CLASS_P (op)
2495 && get_base_address (op)))))
2496 continue;
2500 return res;
2502 /* If dependence analysis will give up due to the limit on the
2503 number of datarefs, stop here and fail fatally. */
2504 if (datarefs->length ()
2505 > (unsigned)param_loop_max_datarefs_for_datadeps)
2506 return opt_result::failure_at (stmt, "exceeded param "
2507 "loop-max-datarefs-for-datadeps\n");
2509 return opt_result::success ();
2512 /* Look for SLP-only access groups and turn each individual access into its own
2513 group. */
2514 static void
2515 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2517 unsigned int i;
2518 struct data_reference *dr;
2520 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2522 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2523 FOR_EACH_VEC_ELT (datarefs, i, dr)
2525 gcc_assert (DR_REF (dr));
2526 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2528 /* Check if the load is a part of an interleaving chain. */
2529 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2531 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2532 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2533 unsigned int group_size = DR_GROUP_SIZE (first_element);
2535 /* Check whether this is an SLP-only group. */
2536 if (!STMT_SLP_TYPE (stmt_info)
2537 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2539 /* Dissolve the group. */
2540 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2542 stmt_vec_info vinfo = first_element;
2543 while (vinfo)
2545 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2546 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2547 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2548 DR_GROUP_SIZE (vinfo) = 1;
2549 if (STMT_VINFO_STRIDED_P (first_element)
2550 /* We cannot handle stores with gaps. */
2551 || DR_IS_WRITE (dr_info->dr))
2553 STMT_VINFO_STRIDED_P (vinfo) = true;
2554 DR_GROUP_GAP (vinfo) = 0;
2556 else
2557 DR_GROUP_GAP (vinfo) = group_size - 1;
2558 /* Duplicate and adjust alignment info, it needs to
2559 be present on each group leader, see dr_misalignment. */
2560 if (vinfo != first_element)
2562 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2563 dr_info2->target_alignment = dr_info->target_alignment;
2564 int misalignment = dr_info->misalignment;
2565 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2567 HOST_WIDE_INT diff
2568 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2569 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2570 unsigned HOST_WIDE_INT align_c
2571 = dr_info->target_alignment.to_constant ();
2572 misalignment = (misalignment + diff) % align_c;
2574 dr_info2->misalignment = misalignment;
2576 vinfo = next;
2583 /* Determine if operating on full vectors for LOOP_VINFO might leave
2584 some scalar iterations still to do. If so, decide how we should
2585 handle those scalar iterations. The possibilities are:
2587 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2588 In this case:
2590 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2591 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2592 LOOP_VINFO_PEELING_FOR_NITER == false
2594 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2595 to handle the remaining scalar iterations. In this case:
2597 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2598 LOOP_VINFO_PEELING_FOR_NITER == true
2600 There are two choices:
2602 (2a) Consider vectorizing the epilogue loop at the same VF as the
2603 main loop, but using partial vectors instead of full vectors.
2604 In this case:
2606 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2608 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2609 In this case:
2611 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
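   For example (illustrative numbers only): a loop of 10 scalar iterations
   with a VF of 4 runs 3 partially-populated vector iterations in case (1),
   or 2 full vector iterations plus an epilogue covering the remaining 2
   scalar iterations in case (2).  */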
2614 opt_result
2615 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2617 /* Determine whether there would be any scalar iterations left over. */
2618 bool need_peeling_or_partial_vectors_p
2619 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2621 /* Decide whether to vectorize the loop with partial vectors. */
2622 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2623 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2624 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2625 && need_peeling_or_partial_vectors_p)
2627 /* For partial-vector-usage=1, try to push the handling of partial
2628 vectors to the epilogue, with the main loop continuing to operate
2629 on full vectors.
2631 If we are unrolling we also do not want to use partial vectors. This
2632 is to avoid the overhead of generating multiple masks and also to
2633 avoid having to execute entire iterations of FALSE masked instructions
2634 when dealing with one or fewer full iterations.
2636 ??? We could then end up failing to use partial vectors if we
2637 decide to peel iterations into a prologue, and if the main loop
2638 then ends up processing fewer than VF iterations. */
2639 if ((param_vect_partial_vector_usage == 1
2640 || loop_vinfo->suggested_unroll_factor > 1)
2641 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2642 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2643 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2644 else
2645 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2648 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "operating on %s vectors%s.\n",
2651 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2652 ? "partial" : "full",
2653 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2654 ? " for epilogue loop" : "");
2656 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2657 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2658 && need_peeling_or_partial_vectors_p);
2660 /* We set LOOP_VINFO_USING_SELECT_VL_P to true before the loop vectorization
2661 analysis, when we don't yet know whether the loop will be vectorized with
2662 partial vectors (see tree-vect-loop-manip.cc for more details).
2664 However, the SELECT_VL vectorization style should only be applied to
2665 partial vectorization, since SELECT_VL is the GIMPLE IR that calculates
2666 the number of elements to be processed in each iteration.
2668 After the loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2669 if the loop is not vectorized with partial vectors. */
2670 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2671 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2673 return opt_result::success ();
2676 /* Function vect_analyze_loop_2.
2678 Apply a set of analyses on the loop specified by LOOP_VINFO; the different
2679 analyses will record information in some members of LOOP_VINFO. FATAL
2680 indicates whether some analysis hits a fatal error. If a non-NULL pointer
2681 SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with the
2682 worked-out suggested unroll factor, while a NULL pointer means we are
2683 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2684 holds the SLP decision made when the suggested unroll factor was worked
2685 out. */
2686 static opt_result
2687 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2688 unsigned *suggested_unroll_factor,
2689 bool& slp_done_for_suggested_uf)
2691 opt_result ok = opt_result::success ();
2692 int res;
2693 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2694 poly_uint64 min_vf = 2;
2695 loop_vec_info orig_loop_vinfo = NULL;
2697 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2698 loop_vec_info of the first vectorized loop. */
2699 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2700 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2701 else
2702 orig_loop_vinfo = loop_vinfo;
2703 gcc_assert (orig_loop_vinfo);
2705 /* The first group of checks is independent of the vector size. */
2706 fatal = true;
2708 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2709 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2710 return opt_result::failure_at (vect_location,
2711 "not vectorized: simd if(0)\n");
2713 /* Find all data references in the loop (which correspond to vdefs/vuses)
2714 and analyze their evolution in the loop. */
2716 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2718 /* Gather the data references and count stmts in the loop. */
2719 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2721 opt_result res
2722 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2723 &LOOP_VINFO_DATAREFS (loop_vinfo),
2724 &LOOP_VINFO_N_STMTS (loop_vinfo));
2725 if (!res)
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2729 "not vectorized: loop contains function "
2730 "calls or data references that cannot "
2731 "be analyzed\n");
2732 return res;
2734 loop_vinfo->shared->save_datarefs ();
2736 else
2737 loop_vinfo->shared->check_datarefs ();
2739 /* Analyze the data references and also adjust the minimal
2740 vectorization factor according to the loads and stores. */
2742 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2743 if (!ok)
2745 if (dump_enabled_p ())
2746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2747 "bad data references.\n");
2748 return ok;
2751 /* Check if we are applying unroll factor now. */
2752 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2753 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2755 /* If the SLP decision was false when the suggested unroll factor was
2756 worked out, and we are now applying that unroll factor, we can simply
2757 skip all SLP-related analyses this time. */
2758 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2760 /* Classify all cross-iteration scalar data-flow cycles.
2761 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2762 vect_analyze_scalar_cycles (loop_vinfo, slp);
2764 vect_pattern_recog (loop_vinfo);
2766 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2768 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2769 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2771 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2772 if (!ok)
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776 "bad data access.\n");
2777 return ok;
2780 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2782 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2783 if (!ok)
2785 if (dump_enabled_p ())
2786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2787 "unexpected pattern.\n");
2788 return ok;
2791 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are no longer treated as fatal. */
2792 fatal = false;
2794 /* Analyze data dependences between the data-refs in the loop
2795 and adjust the maximum vectorization factor according to
2796 the dependences.
2797 FORNOW: fail at the first data dependence that we encounter. */
2799 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2800 if (!ok)
2802 if (dump_enabled_p ())
2803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2804 "bad data dependence.\n");
2805 return ok;
2807 if (max_vf != MAX_VECTORIZATION_FACTOR
2808 && maybe_lt (max_vf, min_vf))
2809 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2810 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2812 ok = vect_determine_vectorization_factor (loop_vinfo);
2813 if (!ok)
2815 if (dump_enabled_p ())
2816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2817 "can't determine vectorization factor.\n");
2818 return ok;
2821 /* Compute the scalar iteration cost. */
2822 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2824 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2826 if (slp)
2828 /* Check the SLP opportunities in the loop, analyze and build
2829 SLP trees. */
2830 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2831 if (!ok)
2832 return ok;
2834 /* If there are any SLP instances mark them as pure_slp. */
2835 slp = vect_make_slp_decision (loop_vinfo);
2836 if (slp)
2838 /* Find stmts that need to be both vectorized and SLPed. */
2839 vect_detect_hybrid_slp (loop_vinfo);
2841 /* Update the vectorization factor based on the SLP decision. */
2842 vect_update_vf_for_slp (loop_vinfo);
2844 /* Optimize the SLP graph with the vectorization factor fixed. */
2845 vect_optimize_slp (loop_vinfo);
2847 /* Gather the loads reachable from the SLP graph entries. */
2848 vect_gather_slp_loads (loop_vinfo);
2852 bool saved_can_use_partial_vectors_p
2853 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2855 /* We don't expect to have to roll back to anything other than an empty
2856 set of rgroups. */
2857 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2859 /* This is the point where we can re-start analysis with SLP forced off. */
2860 start_over:
2862 /* Apply the suggested unrolling factor; this was determined by the backend
2863 during finish_cost the first time we ran the analysis for this
2864 vector mode. */
2865 if (applying_suggested_uf)
2866 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2868 /* Now the vectorization factor is final. */
2869 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2870 gcc_assert (known_ne (vectorization_factor, 0U));
2872 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2874 dump_printf_loc (MSG_NOTE, vect_location,
2875 "vectorization_factor = ");
2876 dump_dec (MSG_NOTE, vectorization_factor);
2877 dump_printf (MSG_NOTE, ", niters = %wd\n",
2878 LOOP_VINFO_INT_NITERS (loop_vinfo));
2881 if (max_vf != MAX_VECTORIZATION_FACTOR
2882 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2883 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2885 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2887 /* Analyze the alignment of the data-refs in the loop.
2888 Fail if a data reference is found that cannot be vectorized. */
2890 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2891 if (!ok)
2893 if (dump_enabled_p ())
2894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2895 "bad data alignment.\n");
2896 return ok;
2899 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2900 It is important to call pruning after vect_analyze_data_ref_accesses,
2901 since we use grouping information gathered by interleaving analysis. */
2902 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2903 if (!ok)
2904 return ok;
2906 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2907 vectorization, since we do not want to add extra peeling or
2908 add versioning for alignment. */
2909 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2910 /* This pass will decide on using loop versioning and/or loop peeling in
2911 order to enhance the alignment of data references in the loop. */
2912 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2913 if (!ok)
2914 return ok;
2916 if (slp)
2918 /* Analyze operations in the SLP instances. Note this may
2919 remove unsupported SLP instances which makes the above
2920 SLP kind detection invalid. */
2921 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2922 vect_slp_analyze_operations (loop_vinfo);
2923 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2925 ok = opt_result::failure_at (vect_location,
2926 "unsupported SLP instances\n");
2927 goto again;
2930 /* Check whether any load in ALL SLP instances is possibly permuted. */
2931 slp_tree load_node, slp_root;
2932 unsigned i, x;
2933 slp_instance instance;
2934 bool can_use_lanes = true;
2935 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2937 slp_root = SLP_INSTANCE_TREE (instance);
2938 int group_size = SLP_TREE_LANES (slp_root);
2939 tree vectype = SLP_TREE_VECTYPE (slp_root);
2940 bool loads_permuted = false;
2941 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2943 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2944 continue;
2945 unsigned j;
2946 stmt_vec_info load_info;
2947 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2948 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2950 loads_permuted = true;
2951 break;
2955 /* If the loads and stores can be handled with load/store-lane
2956 instructions, record it and move on to the next instance. */
2957 if (loads_permuted
2958 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2959 && vect_store_lanes_supported (vectype, group_size, false)
2960 != IFN_LAST)
2962 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2963 if (STMT_VINFO_GROUPED_ACCESS
2964 (SLP_TREE_REPRESENTATIVE (load_node)))
2966 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2967 (SLP_TREE_REPRESENTATIVE (load_node));
2968 /* Use SLP for strided accesses (or if we can't
2969 use load-lanes). */
2970 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2971 || vect_load_lanes_supported
2972 (STMT_VINFO_VECTYPE (stmt_vinfo),
2973 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2974 break;
2977 can_use_lanes
2978 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2980 if (can_use_lanes && dump_enabled_p ())
2981 dump_printf_loc (MSG_NOTE, vect_location,
2982 "SLP instance %p can use load/store-lanes\n",
2983 (void *) instance);
2985 else
2987 can_use_lanes = false;
2988 break;
2992 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2993 with SLP disabled. */
2994 if (can_use_lanes)
2996 ok = opt_result::failure_at (vect_location,
2997 "Built SLP cancelled: can use "
2998 "load/store-lanes\n");
2999 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3001 "Built SLP cancelled: all SLP instances support "
3002 "load/store-lanes\n");
3003 goto again;
3007 /* Dissolve SLP-only groups. */
3008 vect_dissolve_slp_only_groups (loop_vinfo);
3010 /* Scan all the remaining operations in the loop that are not subject
3011 to SLP and make sure they are vectorizable. */
3012 ok = vect_analyze_loop_operations (loop_vinfo);
3013 if (!ok)
3015 if (dump_enabled_p ())
3016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3017 "bad operation or unsupported loop bound.\n");
3018 return ok;
3021 /* For now, we don't expect to mix both masking and length approaches for one
3022 loop; disable the use of partial vectors if both are recorded. */
3023 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3024 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3025 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3027 if (dump_enabled_p ())
3028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3029 "can't vectorize a loop with partial vectors"
3030 " because we don't expect to mix different"
3031 " approaches with partial vectors for the"
3032 " same loop.\n");
3033 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3036 /* If we still have the option of using partial vectors,
3037 check whether we can generate the necessary loop controls. */
3038 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3040 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3042 if (!vect_verify_full_masking (loop_vinfo)
3043 && !vect_verify_full_masking_avx512 (loop_vinfo))
3044 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3046 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3047 if (!vect_verify_loop_lens (loop_vinfo))
3048 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3051 /* If we're vectorizing a loop that uses length "controls" and
3052 can iterate more than once, we apply decrementing IV approach
3053 in loop control. */
3054 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3055 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3056 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3057 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3058 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3059 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3060 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3062 /* If a loop uses length controls and has a decrementing loop control IV,
3063 we will normally pass that IV through a MIN_EXPR to calculate the
3064 basis for the length controls. E.g. in a loop that processes one
3065 element per scalar iteration, the number of elements would be
3066 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3068 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3069 step, since only the final iteration of the vector loop can have
3070 inactive lanes.
3072 However, some targets have a dedicated instruction for calculating the
3073 preferred length, given the total number of elements that still need to
3074 be processed. This is encapsulated in the SELECT_VL internal function.
3076 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3077 to determine the basis for the length controls. However, unlike the
3078 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3079 lanes inactive in any iteration of the vector loop, not just the last
3080 iteration. This SELECT_VL approach therefore requires us to use pointer
3081 IVs with variable steps.
3083 Once we've decided how many elements should be processed by one
3084 iteration of the vector loop, we need to populate the rgroup controls.
3085 If a loop has multiple rgroups, we need to make sure that those rgroups
3086 "line up" (that is, they must be consistent about which elements are
3087 active and which aren't). This is done by vect_adjust_loop_lens_control.
3089 In principle, it would be possible to use vect_adjust_loop_lens_control
3090 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3091 However:
3093 (1) In practice, it only makes sense to use SELECT_VL when a vector
3094 operation will be controlled directly by the result. It is not
3095 worth using SELECT_VL if it would only be the input to other
3096 calculations.
3098 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3099 pointer IV will need N updates by a variable amount (N-1 updates
3100 within the iteration and 1 update to move to the next iteration).
3102 Because of this, we prefer to use the MIN_EXPR approach whenever there
3103 is more than one length control.
3105 In addition, SELECT_VL always operates to a granularity of 1 unit.
3106 If we wanted to use it to control an SLP operation on N consecutive
3107 elements, we would need to make the SELECT_VL inputs measure scalar
3108 iterations (rather than elements) and then multiply the SELECT_VL
3109 result by N. But using SELECT_VL this way is inefficient because
3110 of (1) above.
3112 Furthermore, we don't apply SELECT_VL to a single-rgroup loop when both
3113 of the following are satisfied:
3115 (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3116 (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3118 Since SELECT_VL (with its variable step) makes SCEV analysis fail, we
3119 would lose the benefit of subsequent unrolling optimizations, so we
3120 prefer the MIN_EXPR approach in this situation. */
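  /* Illustrative sketch only (schematic GIMPLE, operand lists simplified and
     names invented): with a single length control the two styles differ
     roughly as

	 MIN_EXPR style                      SELECT_VL style
	   len = MIN_EXPR <remain, VF>;        len = .SELECT_VL (remain, VF);
	   ... .MASK_LEN_LOAD (ptr, len) ...   ... .MASK_LEN_LOAD (ptr, len) ...
	   ptr = ptr + VF * step;              ptr = ptr + len * step;
	   remain = remain - len;              remain = remain - len;

     i.e. the MIN_EXPR style keeps pointer IVs on an invariant step, while
     the SELECT_VL style needs variable-step pointer IVs.  */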
3121 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3123 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3124 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3125 OPTIMIZE_FOR_SPEED)
3126 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3127 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3128 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3129 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3130 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3133 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3134 assuming that the loop will be used as a main loop. We will redo
3135 this analysis later if we instead decide to use the loop as an
3136 epilogue loop. */
3137 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3138 if (!ok)
3139 return ok;
3141 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3142 to be able to handle fewer than VF scalars, or needs to have a lower VF
3143 than the main loop. */
3144 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3145 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3147 poly_uint64 unscaled_vf
3148 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3149 orig_loop_vinfo->suggested_unroll_factor);
3150 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3151 return opt_result::failure_at (vect_location,
3152 "Vectorization factor too high for"
3153 " epilogue loop.\n");
3156 /* Check the costings of the loop make vectorizing worthwhile. */
3157 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3158 if (res < 0)
3160 ok = opt_result::failure_at (vect_location,
3161 "Loop costings may not be worthwhile.\n");
3162 goto again;
3164 if (!res)
3165 return opt_result::failure_at (vect_location,
3166 "Loop costings not worthwhile.\n");
3168 /* If an epilogue loop is required make sure we can create one. */
3169 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3170 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3172 if (dump_enabled_p ())
3173 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3174 if (!vect_can_advance_ivs_p (loop_vinfo)
3175 || !slpeel_can_duplicate_loop_p (loop,
3176 LOOP_VINFO_IV_EXIT (loop_vinfo),
3177 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3179 ok = opt_result::failure_at (vect_location,
3180 "not vectorized: can't create required "
3181 "epilog loop\n");
3182 goto again;
3186 /* During peeling, we need to check if the number of loop iterations is
3187 enough for both the peeled prolog loop and the vector loop. This check
3188 can be merged along with threshold check of loop versioning, so
3189 increase threshold for this case if necessary.
3191 If we are analyzing an epilogue we still want to check what its
3192 versioning threshold would be. If we decide to vectorize the epilogues we
3193 will want to use the lowest versioning threshold of all epilogues and the
3194 main loop. This will enable us to enter a vectorized epilogue even when
3195 versioning the loop. We can't simply check whether the epilogue requires
3196 versioning though since we may have skipped some versioning checks when
3197 analyzing the epilogue. For instance, checks for alias versioning will be
3198 skipped when dealing with epilogues as we assume we already checked them
3199 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3200 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3202 poly_uint64 niters_th = 0;
3203 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3205 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3207 /* Niters for peeled prolog loop. */
3208 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3210 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3211 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3212 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3214 else
3215 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3218 /* Niters for at least one iteration of vectorized loop. */
3219 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3220 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3221 /* One additional iteration because of peeling for gap. */
3222 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3223 niters_th += 1;
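      /* Illustrative numbers only: with prolog peeling of 3, a VF of 8 and
	 peeling for gaps, the computation above gives niters_th = 3 + 8 + 1
	 == 12 before the cost-model threshold is folded in below.  */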
3225 /* Use the same condition as vect_transform_loop to decide when to use
3226 the cost to determine a versioning threshold. */
3227 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3228 && ordered_p (th, niters_th))
3229 niters_th = ordered_max (poly_uint64 (th), niters_th);
3231 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3234 gcc_assert (known_eq (vectorization_factor,
3235 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3237 slp_done_for_suggested_uf = slp;
3239 /* Ok to vectorize! */
3240 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3241 return opt_result::success ();
3243 again:
3244 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3245 gcc_assert (!ok);
3247 /* Try again with SLP forced off, but if we didn't do any SLP there is
3248 no point in re-trying. */
3249 if (!slp)
3250 return ok;
3252 /* If the SLP decision was true when the suggested unroll factor was
3253 worked out, and we are now applying that unroll factor, we don't need
3254 to re-try any more. */
3255 if (applying_suggested_uf && slp_done_for_suggested_uf)
3256 return ok;
3258 /* If there are reduction chains, re-trying will fail anyway. */
3259 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3260 return ok;
3262 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3263 via interleaving or lane instructions. */
3264 slp_instance instance;
3265 slp_tree node;
3266 unsigned i, j;
3267 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3269 stmt_vec_info vinfo;
3270 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3271 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3272 continue;
3273 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3274 unsigned int size = DR_GROUP_SIZE (vinfo);
3275 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3276 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3277 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3278 && ! vect_grouped_store_supported (vectype, size))
3279 return opt_result::failure_at (vinfo->stmt,
3280 "unsupported grouped store\n");
3281 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3283 vinfo = SLP_TREE_REPRESENTATIVE (node);
3284 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3286 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3287 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3288 size = DR_GROUP_SIZE (vinfo);
3289 vectype = STMT_VINFO_VECTYPE (vinfo);
3290 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3291 && ! vect_grouped_load_supported (vectype, single_element_p,
3292 size))
3293 return opt_result::failure_at (vinfo->stmt,
3294 "unsupported grouped load\n");
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_NOTE, vect_location,
3301 "re-trying with SLP disabled\n");
3303 /* Roll back state appropriately. No SLP this time. */
3304 slp = false;
3306 /* Restore the vectorization factor to what it was without SLP. */
3306 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3307 /* Free the SLP instances. */
3308 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3309 vect_free_slp_instance (instance);
3310 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3311 /* Reset SLP type to loop_vect on all stmts. */
3312 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3314 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3315 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3316 !gsi_end_p (si); gsi_next (&si))
3318 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3319 STMT_SLP_TYPE (stmt_info) = loop_vect;
3320 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3321 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3323 /* vectorizable_reduction adjusts reduction stmt def-types,
3324 restore them to that of the PHI. */
3325 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3326 = STMT_VINFO_DEF_TYPE (stmt_info);
3327 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3328 (STMT_VINFO_REDUC_DEF (stmt_info)))
3329 = STMT_VINFO_DEF_TYPE (stmt_info);
3332 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3333 !gsi_end_p (si); gsi_next (&si))
3335 if (is_gimple_debug (gsi_stmt (si)))
3336 continue;
3337 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3338 STMT_SLP_TYPE (stmt_info) = loop_vect;
3339 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3341 stmt_vec_info pattern_stmt_info
3342 = STMT_VINFO_RELATED_STMT (stmt_info);
3343 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3344 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3346 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3347 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3348 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3349 !gsi_end_p (pi); gsi_next (&pi))
3350 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3351 = loop_vect;
3355 /* Free optimized alias test DDRS. */
3356 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3357 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3358 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3359 /* Reset target cost data. */
3360 delete loop_vinfo->vector_costs;
3361 loop_vinfo->vector_costs = nullptr;
3362 /* Reset accumulated rgroup information. */
3363 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3364 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3365 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3366 /* Reset assorted flags. */
3367 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3368 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3369 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3370 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3371 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3372 = saved_can_use_partial_vectors_p;
3374 goto start_over;
3377 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3378 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3379 OLD_LOOP_VINFO is better unless something specifically indicates
3380 otherwise.
3382 Note that this deliberately isn't a partial order. */
3384 static bool
3385 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3386 loop_vec_info old_loop_vinfo)
3388 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3389 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3391 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3392 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3394 /* Always prefer a VF of loop->simdlen over any other VF. */
3395 if (loop->simdlen)
3397 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3398 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3399 if (new_simdlen_p != old_simdlen_p)
3400 return new_simdlen_p;
3403 const auto *old_costs = old_loop_vinfo->vector_costs;
3404 const auto *new_costs = new_loop_vinfo->vector_costs;
3405 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3406 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3408 return new_costs->better_main_loop_than_p (old_costs);
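/* As a hedged illustration of the simdlen preference in
   vect_better_loop_vinfo_p above (example values, not taken from any
   particular target): for a loop annotated with

     #pragma omp simd simdlen(8)
     for (int i = 0; i < n; i++)
       a[i] += b[i];

   a candidate whose vectorization factor is 8 is preferred over one whose
   factor is 16 even if the latter has the lower cost, because only the
   former matches loop->simdlen; the cost comparison is consulted only when
   neither or both candidates match.  */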
3411 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3412 true if we should. */
3414 static bool
3415 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3416 loop_vec_info old_loop_vinfo)
3418 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3419 return false;
3421 if (dump_enabled_p ())
3422 dump_printf_loc (MSG_NOTE, vect_location,
3423 "***** Preferring vector mode %s to vector mode %s\n",
3424 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3425 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3426 return true;
3429 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3430 not NULL. Set AUTODETECTED_VECTOR_MODE if it is still VOIDmode and
3431 advance MODE_I to the next mode useful to analyze.
3432 Return the loop_vinfo on success and wrapped null on failure. */
3434 static opt_loop_vec_info
3435 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3436 const vect_loop_form_info *loop_form_info,
3437 loop_vec_info main_loop_vinfo,
3438 const vector_modes &vector_modes, unsigned &mode_i,
3439 machine_mode &autodetected_vector_mode,
3440 bool &fatal)
3442 loop_vec_info loop_vinfo
3443 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3445 machine_mode vector_mode = vector_modes[mode_i];
3446 loop_vinfo->vector_mode = vector_mode;
3447 unsigned int suggested_unroll_factor = 1;
3448 bool slp_done_for_suggested_uf = false;
3450 /* Run the main analysis. */
3451 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3452 &suggested_unroll_factor,
3453 slp_done_for_suggested_uf);
3454 if (dump_enabled_p ())
3455 dump_printf_loc (MSG_NOTE, vect_location,
3456 "***** Analysis %s with vector mode %s\n",
3457 res ? "succeeded" : "failed",
3458 GET_MODE_NAME (loop_vinfo->vector_mode));
3460 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3462 if (dump_enabled_p ())
3463 dump_printf_loc (MSG_NOTE, vect_location,
3464 "***** Re-trying analysis for unrolling"
3465 " with unroll factor %d and slp %s.\n",
3466 suggested_unroll_factor,
3467 slp_done_for_suggested_uf ? "on" : "off");
3468 loop_vec_info unroll_vinfo
3469 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3470 unroll_vinfo->vector_mode = vector_mode;
3471 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3472 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3473 slp_done_for_suggested_uf);
3474 if (new_res)
3476 delete loop_vinfo;
3477 loop_vinfo = unroll_vinfo;
3479 else
3480 delete unroll_vinfo;
3483 /* Remember the autodetected vector mode. */
3484 if (vector_mode == VOIDmode)
3485 autodetected_vector_mode = loop_vinfo->vector_mode;
3487 /* Advance mode_i, first skipping modes that would yield the
3488 same analysis result. */
3489 while (mode_i + 1 < vector_modes.length ()
3490 && vect_chooses_same_modes_p (loop_vinfo,
3491 vector_modes[mode_i + 1]))
3493 if (dump_enabled_p ())
3494 dump_printf_loc (MSG_NOTE, vect_location,
3495 "***** The result for vector mode %s would"
3496 " be the same\n",
3497 GET_MODE_NAME (vector_modes[mode_i + 1]));
3498 mode_i += 1;
3500 if (mode_i + 1 < vector_modes.length ()
3501 && VECTOR_MODE_P (autodetected_vector_mode)
3502 && (related_vector_mode (vector_modes[mode_i + 1],
3503 GET_MODE_INNER (autodetected_vector_mode))
3504 == autodetected_vector_mode)
3505 && (related_vector_mode (autodetected_vector_mode,
3506 GET_MODE_INNER (vector_modes[mode_i + 1]))
3507 == vector_modes[mode_i + 1]))
3509 if (dump_enabled_p ())
3510 dump_printf_loc (MSG_NOTE, vect_location,
3511 "***** Skipping vector mode %s, which would"
3512 " repeat the analysis for %s\n",
3513 GET_MODE_NAME (vector_modes[mode_i + 1]),
3514 GET_MODE_NAME (autodetected_vector_mode));
3515 mode_i += 1;
3517 mode_i++;
3519 if (!res)
3521 delete loop_vinfo;
3522 if (fatal)
3523 gcc_checking_assert (main_loop_vinfo == NULL);
3524 return opt_loop_vec_info::propagate_failure (res);
3527 return opt_loop_vec_info::success (loop_vinfo);
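/* For example (illustrative and target-dependent): if the autodetected mode
   is V16QImode and the next candidate V8HImode has the same vector size,
   the two related_vector_mode queries in vect_analyze_loop_1 above map each
   mode onto the other, so analyzing V8HImode would merely repeat the
   V16QImode analysis and the candidate is skipped.  */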
3530 /* Function vect_analyze_loop.
3532 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3533 for it. The different analyses will record information in the
3534 loop_vec_info struct. */
3535 opt_loop_vec_info
3536 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3538 DUMP_VECT_SCOPE ("analyze_loop_nest");
3540 if (loop_outer (loop)
3541 && loop_vec_info_for_loop (loop_outer (loop))
3542 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3543 return opt_loop_vec_info::failure_at (vect_location,
3544 "outer-loop already vectorized.\n");
3546 if (!find_loop_nest (loop, &shared->loop_nest))
3547 return opt_loop_vec_info::failure_at
3548 (vect_location,
3549 "not vectorized: loop nest containing two or more consecutive inner"
3550 " loops cannot be vectorized\n");
3552 /* Analyze the loop form. */
3553 vect_loop_form_info loop_form_info;
3554 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3555 if (!res)
3557 if (dump_enabled_p ())
3558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3559 "bad loop form.\n");
3560 return opt_loop_vec_info::propagate_failure (res);
3562 if (!integer_onep (loop_form_info.assumptions))
3564 /* We consider vectorizing this loop by versioning it under
3565 some assumptions. In order to do this, we need to clear
3566 existing information computed by scev and niter analyzer. */
3567 scev_reset_htab ();
3568 free_numbers_of_iterations_estimates (loop);
3569 /* Also set a flag for this loop so that the following scev and niter
3570 analyses are done under the assumptions.
3571 loop_constraint_set (loop, LOOP_C_FINITE);
3574 auto_vector_modes vector_modes;
3575 /* Autodetect first vector size we try. */
3576 vector_modes.safe_push (VOIDmode);
3577 unsigned int autovec_flags
3578 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3579 loop->simdlen != 0);
3580 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3581 && !unlimited_cost_model (loop));
3582 machine_mode autodetected_vector_mode = VOIDmode;
3583 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3584 unsigned int mode_i = 0;
3585 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3587 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3588 a mode has not been analyzed. */
3589 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3590 for (unsigned i = 0; i < vector_modes.length (); ++i)
3591 cached_vf_per_mode.safe_push (0);
3593 /* First determine the main loop vectorization mode, either the first
3594 one that works, starting with auto-detecting the vector mode and then
3595 following the target's order of preference, or the one with the
3596 lowest cost if pick_lowest_cost_p. */
3597 while (1)
3599 bool fatal;
3600 unsigned int last_mode_i = mode_i;
3601 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3602 failed. */
3603 cached_vf_per_mode[last_mode_i] = -1;
3604 opt_loop_vec_info loop_vinfo
3605 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3606 NULL, vector_modes, mode_i,
3607 autodetected_vector_mode, fatal);
3608 if (fatal)
3609 break;
3611 if (loop_vinfo)
3613 /* Analysis has been successful so update the VF value. The
3614 VF should always be a multiple of unroll_factor and we want to
3615 capture the original VF here. */
3616 cached_vf_per_mode[last_mode_i]
3617 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3618 loop_vinfo->suggested_unroll_factor);
3619 /* Once we hit the desired simdlen for the first time,
3620 discard any previous attempts. */
3621 if (simdlen
3622 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3624 delete first_loop_vinfo;
3625 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3626 simdlen = 0;
3628 else if (pick_lowest_cost_p
3629 && first_loop_vinfo
3630 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3632 /* Pick loop_vinfo over first_loop_vinfo. */
3633 delete first_loop_vinfo;
3634 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3636 if (first_loop_vinfo == NULL)
3637 first_loop_vinfo = loop_vinfo;
3638 else
3640 delete loop_vinfo;
3641 loop_vinfo = opt_loop_vec_info::success (NULL);
3644 /* Commit to first_loop_vinfo if we have no reason to try
3645 alternatives. */
3646 if (!simdlen && !pick_lowest_cost_p)
3647 break;
3649 if (mode_i == vector_modes.length ()
3650 || autodetected_vector_mode == VOIDmode)
3651 break;
3653 /* Try the next biggest vector size. */
3654 if (dump_enabled_p ())
3655 dump_printf_loc (MSG_NOTE, vect_location,
3656 "***** Re-trying analysis with vector mode %s\n",
3657 GET_MODE_NAME (vector_modes[mode_i]));
3659 if (!first_loop_vinfo)
3660 return opt_loop_vec_info::propagate_failure (res);
3662 if (dump_enabled_p ())
3663 dump_printf_loc (MSG_NOTE, vect_location,
3664 "***** Choosing vector mode %s\n",
3665 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3667 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3668 enabled, SIMDUID is not set, it is the innermost loop and we have
3669 either already found the loop's SIMDLEN or there was no SIMDLEN to
3670 begin with.
3671 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3672 bool vect_epilogues = (!simdlen
3673 && loop->inner == NULL
3674 && param_vect_epilogues_nomask
3675 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3676 && !loop->simduid);
3677 if (!vect_epilogues)
3678 return first_loop_vinfo;
3680 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3681 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3683 /* For epilogues start the analysis from the first mode. The motivation
3684 behind starting from the beginning comes from cases where the VECTOR_MODES
3685 array may contain length-agnostic and length-specific modes. Their
3686 ordering is not guaranteed, so we could end up picking a mode for the main
3687 loop that is after the epilogue's optimal mode. */
3688 vector_modes[0] = autodetected_vector_mode;
3689 mode_i = 0;
3691 bool supports_partial_vectors =
3692 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3693 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3695 while (1)
3697 /* If the target does not support partial vectors we can shorten the
3698 number of modes to analyze for the epilogue as we know we can't pick a
3699 mode that would lead to a VF at least as big as the
3700 FIRST_VINFO_VF. */
3701 if (!supports_partial_vectors
3702 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3704 mode_i++;
3705 if (mode_i == vector_modes.length ())
3706 break;
3707 continue;
3710 if (dump_enabled_p ())
3711 dump_printf_loc (MSG_NOTE, vect_location,
3712 "***** Re-trying epilogue analysis with vector "
3713 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3715 bool fatal;
3716 opt_loop_vec_info loop_vinfo
3717 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3718 first_loop_vinfo,
3719 vector_modes, mode_i,
3720 autodetected_vector_mode, fatal);
3721 if (fatal)
3722 break;
3724 if (loop_vinfo)
3726 if (pick_lowest_cost_p)
3728 /* Keep trying to roll back vectorization attempts while the
3729 loop_vec_infos they produced were worse than this one. */
3730 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3731 while (!vinfos.is_empty ()
3732 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3734 gcc_assert (vect_epilogues);
3735 delete vinfos.pop ();
3738 /* For now only allow one epilogue loop. */
3739 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3741 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3742 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3743 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3744 || maybe_ne (lowest_th, 0U));
3745 /* Keep track of the known smallest versioning
3746 threshold. */
3747 if (ordered_p (lowest_th, th))
3748 lowest_th = ordered_min (lowest_th, th);
3750 else
3752 delete loop_vinfo;
3753 loop_vinfo = opt_loop_vec_info::success (NULL);
3756 /* For now only allow one epilogue loop, but allow
3757 pick_lowest_cost_p to replace it, so commit to the
3758 first epilogue if we have no reason to try alternatives. */
3759 if (!pick_lowest_cost_p)
3760 break;
3763 if (mode_i == vector_modes.length ())
3764 break;
3768 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3770 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3771 if (dump_enabled_p ())
3772 dump_printf_loc (MSG_NOTE, vect_location,
3773 "***** Choosing epilogue vector mode %s\n",
3774 GET_MODE_NAME
3775 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3778 return first_loop_vinfo;
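/* A hedged example of the overall outcome (numbers purely illustrative):
   for a loop with 1003 iterations vectorized with a main-loop VF of 16,
   62 vector iterations cover 992 scalar iterations and leave 11 behind; an
   epilogue loop_vinfo pushed onto epilogue_vinfos above (for instance one
   with VF 8, or a partial-vector one) lets most of that remainder be
   handled with vector code instead of a scalar epilogue.  */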
3781 /* Return true if there is an in-order reduction function for CODE, storing
3782 it in *REDUC_FN if so. */
3784 static bool
3785 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3787 /* We support MINUS_EXPR by negating the operand. This also preserves an
3788 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3789 (-0.0) = -0.0. */
3790 if (code == PLUS_EXPR || code == MINUS_EXPR)
3792 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3793 return true;
3795 return false;
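/* For illustration: an in-order float reduction written as

     for (int i = 0; i < n; i++)
       s -= a[i];

   is handled through IFN_FOLD_LEFT_PLUS by negating the addend, i.e. it is
   treated as s += -a[i] each iteration, which is why MINUS_EXPR is accepted
   above alongside PLUS_EXPR.  */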
3798 /* Function reduction_fn_for_scalar_code
3800 Input:
3801 CODE - tree_code of a reduction operation.
3803 Output:
3804 REDUC_FN - the corresponding internal function to be used to reduce the
3805 vector of partial results into a single scalar result, or IFN_LAST
3806 if the operation is a supported reduction operation, but does not have
3807 such an internal function.
3809 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3811 bool
3812 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3814 if (code.is_tree_code ())
3815 switch (tree_code (code))
3817 case MAX_EXPR:
3818 *reduc_fn = IFN_REDUC_MAX;
3819 return true;
3821 case MIN_EXPR:
3822 *reduc_fn = IFN_REDUC_MIN;
3823 return true;
3825 case PLUS_EXPR:
3826 *reduc_fn = IFN_REDUC_PLUS;
3827 return true;
3829 case BIT_AND_EXPR:
3830 *reduc_fn = IFN_REDUC_AND;
3831 return true;
3833 case BIT_IOR_EXPR:
3834 *reduc_fn = IFN_REDUC_IOR;
3835 return true;
3837 case BIT_XOR_EXPR:
3838 *reduc_fn = IFN_REDUC_XOR;
3839 return true;
3841 case MULT_EXPR:
3842 case MINUS_EXPR:
3843 *reduc_fn = IFN_LAST;
3844 return true;
3846 default:
3847 return false;
3849 else
3850 switch (combined_fn (code))
3852 CASE_CFN_FMAX:
3853 *reduc_fn = IFN_REDUC_FMAX;
3854 return true;
3856 CASE_CFN_FMIN:
3857 *reduc_fn = IFN_REDUC_FMIN;
3858 return true;
3860 default:
3861 return false;
3865 /* If there is a neutral value X such that a reduction would not be affected
3866 by the introduction of additional X elements, return that X, otherwise
3867 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3868 of the scalar elements. If the reduction has just a single initial value
3869 then INITIAL_VALUE is that value, otherwise it is null.
3870 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3871 In that case no signed zero is returned. */
3873 tree
3874 neutral_op_for_reduction (tree scalar_type, code_helper code,
3875 tree initial_value, bool as_initial)
3877 if (code.is_tree_code ())
3878 switch (tree_code (code))
3880 case DOT_PROD_EXPR:
3881 case SAD_EXPR:
3882 case MINUS_EXPR:
3883 case BIT_IOR_EXPR:
3884 case BIT_XOR_EXPR:
3885 return build_zero_cst (scalar_type);
3886 case WIDEN_SUM_EXPR:
3887 case PLUS_EXPR:
3888 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3889 return build_real (scalar_type, dconstm0);
3890 else
3891 return build_zero_cst (scalar_type);
3893 case MULT_EXPR:
3894 return build_one_cst (scalar_type);
3896 case BIT_AND_EXPR:
3897 return build_all_ones_cst (scalar_type);
3899 case MAX_EXPR:
3900 case MIN_EXPR:
3901 return initial_value;
3903 default:
3904 return NULL_TREE;
3906 else
3907 switch (combined_fn (code))
3909 CASE_CFN_FMIN:
3910 CASE_CFN_FMAX:
3911 return initial_value;
3913 default:
3914 return NULL_TREE;
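/* For example: padding a PLUS_EXPR reduction over doubles with extra lanes
   uses -0.0 rather than 0.0 when signed zeros are honored and the value is
   not the initial one, because x + (-0.0) == x for every x (including
   x == -0.0), whereas x + 0.0 would turn a -0.0 result into +0.0.  */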
3918 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3919 STMT is printed with a message MSG. */
3921 static void
3922 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3924 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3927 /* Return true if we need an in-order reduction for operation CODE
3928 on type TYPE. */
3931 bool
3932 needs_fold_left_reduction_p (tree type, code_helper code)
3934 /* CHECKME: check for !flag_finite_math_only too? */
3935 if (SCALAR_FLOAT_TYPE_P (type))
3937 if (code.is_tree_code ())
3938 switch (tree_code (code))
3940 case MIN_EXPR:
3941 case MAX_EXPR:
3942 return false;
3944 default:
3945 return !flag_associative_math;
3947 else
3948 switch (combined_fn (code))
3950 CASE_CFN_FMIN:
3951 CASE_CFN_FMAX:
3952 return false;
3954 default:
3955 return !flag_associative_math;
3959 if (INTEGRAL_TYPE_P (type))
3960 return (!code.is_tree_code ()
3961 || !operation_no_trapping_overflow (type, tree_code (code)));
3963 if (SAT_FIXED_POINT_TYPE_P (type))
3964 return true;
3966 return false;
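/* A worked instance of why floating-point sums are kept in order unless
   -fassociative-math is in effect: with doubles, (-1e16 + 1e16) + 1.0 == 1.0
   but -1e16 + (1e16 + 1.0) == 0.0, because 1e16 + 1.0 rounds back to 1e16.
   Reordering the additions, as a vectorized reduction would, can therefore
   change the result.  */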
3969 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3970 has a handled computation expression. Store the main reduction
3971 operation in *CODE. */
3973 static bool
3974 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3975 tree loop_arg, code_helper *code,
3976 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3978 auto_bitmap visited;
3979 tree lookfor = PHI_RESULT (phi);
3980 ssa_op_iter curri;
3981 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3982 while (USE_FROM_PTR (curr) != loop_arg)
3983 curr = op_iter_next_use (&curri);
3984 curri.i = curri.numops;
3987 path.safe_push (std::make_pair (curri, curr));
3988 tree use = USE_FROM_PTR (curr);
3989 if (use == lookfor)
3990 break;
3991 gimple *def = SSA_NAME_DEF_STMT (use);
3992 if (gimple_nop_p (def)
3993 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3995 pop:
3998 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3999 curri = x.first;
4000 curr = x.second;
4002 curr = op_iter_next_use (&curri);
4003 /* Skip already visited or non-SSA operands (from iterating
4004 over PHI args). */
4005 while (curr != NULL_USE_OPERAND_P
4006 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4007 || ! bitmap_set_bit (visited,
4008 SSA_NAME_VERSION
4009 (USE_FROM_PTR (curr)))));
4011 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4012 if (curr == NULL_USE_OPERAND_P)
4013 break;
4015 else
4017 if (gimple_code (def) == GIMPLE_PHI)
4018 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4019 else
4020 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4021 while (curr != NULL_USE_OPERAND_P
4022 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4023 || ! bitmap_set_bit (visited,
4024 SSA_NAME_VERSION
4025 (USE_FROM_PTR (curr)))))
4026 curr = op_iter_next_use (&curri);
4027 if (curr == NULL_USE_OPERAND_P)
4028 goto pop;
4031 while (1);
4032 if (dump_file && (dump_flags & TDF_DETAILS))
4034 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4035 unsigned i;
4036 std::pair<ssa_op_iter, use_operand_p> *x;
4037 FOR_EACH_VEC_ELT (path, i, x)
4038 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4039 dump_printf (MSG_NOTE, "\n");
4042 /* Check whether the reduction path detected is valid. */
4043 bool fail = path.length () == 0;
4044 bool neg = false;
4045 int sign = -1;
4046 *code = ERROR_MARK;
4047 for (unsigned i = 1; i < path.length (); ++i)
4049 gimple *use_stmt = USE_STMT (path[i].second);
4050 gimple_match_op op;
4051 if (!gimple_extract_op (use_stmt, &op))
4053 fail = true;
4054 break;
4056 unsigned int opi = op.num_ops;
4057 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4059 /* The following makes sure we can compute the operand index
4060 easily plus it mostly disallows chaining via COND_EXPR condition
4061 operands. */
4062 for (opi = 0; opi < op.num_ops; ++opi)
4063 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4064 break;
4066 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4068 for (opi = 0; opi < op.num_ops; ++opi)
4069 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4070 break;
4072 if (opi == op.num_ops)
4074 fail = true;
4075 break;
4077 op.code = canonicalize_code (op.code, op.type);
4078 if (op.code == MINUS_EXPR)
4080 op.code = PLUS_EXPR;
4081 /* Track whether we negate the reduction value each iteration. */
4082 if (op.ops[1] == op.ops[opi])
4083 neg = ! neg;
4085 if (CONVERT_EXPR_CODE_P (op.code)
4086 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4088 else if (*code == ERROR_MARK)
4090 *code = op.code;
4091 sign = TYPE_SIGN (op.type);
4093 else if (op.code != *code)
4095 fail = true;
4096 break;
4098 else if ((op.code == MIN_EXPR
4099 || op.code == MAX_EXPR)
4100 && sign != TYPE_SIGN (op.type))
4102 fail = true;
4103 break;
4105 /* Check that the op is used in only a single stmt. For the
4106 non-value-changing tail and the last stmt allow out-of-loop uses.
4107 ??? We could relax this and handle arbitrary live stmts by
4108 forcing a scalar epilogue for example. */
4109 imm_use_iterator imm_iter;
4110 use_operand_p use_p;
4111 gimple *op_use_stmt;
4112 unsigned cnt = 0;
4113 bool cond_fn_p = op.code.is_internal_fn ()
4114 && (conditional_internal_fn_code (internal_fn (op.code))
4115 != ERROR_MARK);
4117 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4119 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4120 op1 twice (once as definition, once as else) in the same operation.
4121 Allow this. */
4122 if (cond_fn_p && op_use_stmt == use_stmt)
4124 gcall *call = as_a<gcall *> (use_stmt);
4125 unsigned else_pos
4126 = internal_fn_else_index (internal_fn (op.code));
4128 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4130 if (j == else_pos)
4131 continue;
4132 if (gimple_call_arg (call, j) == op.ops[opi])
4133 cnt++;
4136 else if (!is_gimple_debug (op_use_stmt)
4137 && (*code != ERROR_MARK
4138 || flow_bb_inside_loop_p (loop,
4139 gimple_bb (op_use_stmt))))
4140 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4141 cnt++;
4144 if (cnt != 1)
4146 fail = true;
4147 break;
4150 return ! fail && ! neg && *code != ERROR_MARK;
4153 bool
4154 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4155 tree loop_arg, enum tree_code code)
4157 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4158 code_helper code_;
4159 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4160 && code_ == code);
4165 /* Function vect_is_simple_reduction
4167 (1) Detect a cross-iteration def-use cycle that represents a simple
4168 reduction computation. We look for the following pattern:
4170 loop_header:
4171 a1 = phi < a0, a2 >
4172 a3 = ...
4173 a2 = operation (a3, a1)
4177 a3 = ...
4178 loop_header:
4179 a1 = phi < a0, a2 >
4180 a2 = operation (a3, a1)
4182 such that:
4183 1. operation is commutative and associative and it is safe to
4184 change the order of the computation
4185 2. no uses for a2 in the loop (a2 is used out of the loop)
4186 3. no uses of a1 in the loop besides the reduction operation
4187 4. no uses of a1 outside the loop.
4189 Conditions 1,4 are tested here.
4190 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4192 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4193 nested cycles.
4195 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4196 reductions:
4198 a1 = phi < a0, a2 >
4199 inner loop (def of a3)
4200 a2 = phi < a3 >
4202 (4) Detect condition expressions, i.e.:
4203 for (int i = 0; i < N; i++)
4204 if (a[i] < val)
4205 ret_val = a[i];
4209 static stmt_vec_info
4210 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4211 bool *double_reduc, bool *reduc_chain_p, bool slp)
4213 gphi *phi = as_a <gphi *> (phi_info->stmt);
4214 gimple *phi_use_stmt = NULL;
4215 imm_use_iterator imm_iter;
4216 use_operand_p use_p;
4218 *double_reduc = false;
4219 *reduc_chain_p = false;
4220 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4222 tree phi_name = PHI_RESULT (phi);
4223 /* ??? If there are no uses of the PHI result the inner loop reduction
4224 won't be detected as possibly double-reduction by vectorizable_reduction
4225 because that tries to walk the PHI arg from the preheader edge which
4226 can be constant. See PR60382. */
4227 if (has_zero_uses (phi_name))
4228 return NULL;
4229 class loop *loop = (gimple_bb (phi))->loop_father;
4230 unsigned nphi_def_loop_uses = 0;
4231 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4233 gimple *use_stmt = USE_STMT (use_p);
4234 if (is_gimple_debug (use_stmt))
4235 continue;
4237 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4239 if (dump_enabled_p ())
4240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4241 "intermediate value used outside loop.\n");
4243 return NULL;
4246 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4247 op1 twice (once as definition, once as else) in the same operation.
4248 Only count it as one. */
4249 if (use_stmt != phi_use_stmt)
4251 nphi_def_loop_uses++;
4252 phi_use_stmt = use_stmt;
4256 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4257 if (TREE_CODE (latch_def) != SSA_NAME)
4259 if (dump_enabled_p ())
4260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4261 "reduction: not ssa_name: %T\n", latch_def);
4262 return NULL;
4265 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4266 if (!def_stmt_info
4267 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4268 return NULL;
4270 bool nested_in_vect_loop
4271 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4272 unsigned nlatch_def_loop_uses = 0;
4273 auto_vec<gphi *, 3> lcphis;
4274 bool inner_loop_of_double_reduc = false;
4275 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4277 gimple *use_stmt = USE_STMT (use_p);
4278 if (is_gimple_debug (use_stmt))
4279 continue;
4280 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4281 nlatch_def_loop_uses++;
4282 else
4284 /* We can have more than one loop-closed PHI. */
4285 lcphis.safe_push (as_a <gphi *> (use_stmt));
4286 if (nested_in_vect_loop
4287 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4288 == vect_double_reduction_def))
4289 inner_loop_of_double_reduc = true;
4293 /* If we are vectorizing an inner reduction, we execute it in the
4294 original order only when we are not dealing with a
4295 double reduction. */
4296 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4298 if (dump_enabled_p ())
4299 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4300 "detected nested cycle: ");
4301 return def_stmt_info;
4304 /* When the inner loop of a double reduction ends up with more than
4305 one loop-closed PHI we have failed to classify alternate such
4306 PHIs as double reduction, leading to wrong code. See PR103237. */
4307 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4309 if (dump_enabled_p ())
4310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4311 "unhandle double reduction\n");
4312 return NULL;
4315 /* If this isn't a nested cycle or if the nested cycle reduction value
4316 is used outside of the inner loop we cannot handle uses of the reduction
4317 value. */
4318 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4320 if (dump_enabled_p ())
4321 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4322 "reduction used in loop.\n");
4323 return NULL;
4326 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4327 defined in the inner loop. */
4328 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4330 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4331 if (gimple_phi_num_args (def_stmt) != 1
4332 || TREE_CODE (op1) != SSA_NAME)
4334 if (dump_enabled_p ())
4335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4336 "unsupported phi node definition.\n");
4338 return NULL;
4341 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4342 and the latch definition op1. */
4343 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4344 if (gimple_bb (def1)
4345 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4346 && loop->inner
4347 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4348 && (is_gimple_assign (def1) || is_gimple_call (def1))
4349 && is_a <gphi *> (phi_use_stmt)
4350 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4351 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4352 loop_latch_edge (loop->inner))))
4354 if (dump_enabled_p ())
4355 report_vect_op (MSG_NOTE, def_stmt,
4356 "detected double reduction: ");
4358 *double_reduc = true;
4359 return def_stmt_info;
4362 return NULL;
4365 /* Look for the expression computing latch_def from the loop PHI result. */
4366 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4367 code_helper code;
4368 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4369 path))
4371 STMT_VINFO_REDUC_CODE (phi_info) = code;
4372 if (code == COND_EXPR && !nested_in_vect_loop)
4373 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4375 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4376 reduction chain for which the additional restriction is that
4377 all operations in the chain are the same. */
4378 auto_vec<stmt_vec_info, 8> reduc_chain;
4379 unsigned i;
4380 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4381 for (i = path.length () - 1; i >= 1; --i)
4383 gimple *stmt = USE_STMT (path[i].second);
4384 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4385 gimple_match_op op;
4386 if (!gimple_extract_op (stmt, &op))
4387 gcc_unreachable ();
4388 if (gassign *assign = dyn_cast<gassign *> (stmt))
4389 STMT_VINFO_REDUC_IDX (stmt_info)
4390 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4391 else
4393 gcall *call = as_a<gcall *> (stmt);
4394 STMT_VINFO_REDUC_IDX (stmt_info)
4395 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4397 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4398 && (i == 1 || i == path.length () - 1));
4399 if ((op.code != code && !leading_conversion)
4400 /* We can only handle the final value in epilogue
4401 generation for reduction chains. */
4402 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4403 is_slp_reduc = false;
4404 /* For reduction chains we support trailing/leading
4405 conversions. We do not store those in the actual chain. */
4406 if (leading_conversion)
4407 continue;
4408 reduc_chain.safe_push (stmt_info);
4410 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4412 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4414 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4415 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4417 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4418 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4420 /* Save the chain for further analysis in SLP detection. */
4421 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4422 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4424 *reduc_chain_p = true;
4425 if (dump_enabled_p ())
4426 dump_printf_loc (MSG_NOTE, vect_location,
4427 "reduction: detected reduction chain\n");
4429 else if (dump_enabled_p ())
4430 dump_printf_loc (MSG_NOTE, vect_location,
4431 "reduction: detected reduction\n");
4433 return def_stmt_info;
4436 if (dump_enabled_p ())
4437 dump_printf_loc (MSG_NOTE, vect_location,
4438 "reduction: unknown pattern\n");
4440 return NULL;
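/* Roughly, the "reduction chain" case recorded above corresponds to source
   like (illustrative only):

     for (int i = 0; i < n; i++)
       {
         s += a[2*i];
         s += a[2*i + 1];
       }

   where every statement in the chain uses the previous value of s exactly
   once and all of them apply the same operation, whereas a plain detected
   reduction has a single such statement per iteration.  */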
4443 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4444 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4445 or -1 if not known. */
4447 static int
4448 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4450 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4451 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4453 if (dump_enabled_p ())
4454 dump_printf_loc (MSG_NOTE, vect_location,
4455 "cost model: epilogue peel iters set to vf/2 "
4456 "because loop iterations are unknown .\n");
4457 return assumed_vf / 2;
4459 else
4461 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4462 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4463 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4464 /* If we need to peel for gaps but no epilogue peeling would otherwise
4465 be required, we have to peel VF iterations. */
4466 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4467 peel_iters_epilogue = assumed_vf;
4468 return peel_iters_epilogue;
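/* Worked example (hypothetical numbers): with 100 known iterations, an
   assumed VF of 8 and 3 prologue iterations peeled for alignment, the
   epilogue gets (100 - 3) % 8 == 1 iteration; if peeling for gaps were
   required and that remainder had been 0, a full 8 iterations would be
   peeled instead.  */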
4472 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4473 int
4474 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4475 int *peel_iters_epilogue,
4476 stmt_vector_for_cost *scalar_cost_vec,
4477 stmt_vector_for_cost *prologue_cost_vec,
4478 stmt_vector_for_cost *epilogue_cost_vec)
4480 int retval = 0;
4482 *peel_iters_epilogue
4483 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4485 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4487 /* If peeled iterations are known but the number of scalar loop
4488 iterations is unknown, count a taken branch per peeled loop. */
4489 if (peel_iters_prologue > 0)
4490 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4491 vect_prologue);
4492 if (*peel_iters_epilogue > 0)
4493 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4494 vect_epilogue);
4497 stmt_info_for_cost *si;
4498 int j;
4499 if (peel_iters_prologue)
4500 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4501 retval += record_stmt_cost (prologue_cost_vec,
4502 si->count * peel_iters_prologue,
4503 si->kind, si->stmt_info, si->misalign,
4504 vect_prologue);
4505 if (*peel_iters_epilogue)
4506 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4507 retval += record_stmt_cost (epilogue_cost_vec,
4508 si->count * *peel_iters_epilogue,
4509 si->kind, si->stmt_info, si->misalign,
4510 vect_epilogue);
4512 return retval;
4515 /* Function vect_estimate_min_profitable_iters
4517 Return the number of iterations required for the vector version of the
4518 loop to be profitable relative to the cost of the scalar version of the
4519 loop.
4521 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4522 of iterations for vectorization. A value of -1 means loop vectorization
4523 is not profitable. This returned value may be used for a dynamic
4524 profitability check.
4526 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4527 for static check against estimated number of iterations. */
4529 static void
4530 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4531 int *ret_min_profitable_niters,
4532 int *ret_min_profitable_estimate,
4533 unsigned *suggested_unroll_factor)
4535 int min_profitable_iters;
4536 int min_profitable_estimate;
4537 int peel_iters_prologue;
4538 int peel_iters_epilogue;
4539 unsigned vec_inside_cost = 0;
4540 int vec_outside_cost = 0;
4541 unsigned vec_prologue_cost = 0;
4542 unsigned vec_epilogue_cost = 0;
4543 int scalar_single_iter_cost = 0;
4544 int scalar_outside_cost = 0;
4545 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4546 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4547 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4549 /* Cost model disabled. */
4550 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4552 if (dump_enabled_p ())
4553 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4554 *ret_min_profitable_niters = 0;
4555 *ret_min_profitable_estimate = 0;
4556 return;
4559 /* Requires loop versioning tests to handle misalignment. */
4560 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4562 /* FIXME: Make cost depend on complexity of individual check. */
4563 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4564 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4565 if (dump_enabled_p ())
4566 dump_printf (MSG_NOTE,
4567 "cost model: Adding cost of checks for loop "
4568 "versioning to treat misalignment.\n");
4571 /* Requires loop versioning with alias checks. */
4572 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4574 /* FIXME: Make cost depend on complexity of individual check. */
4575 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4576 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4577 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4578 if (len)
4579 /* Count LEN - 1 ANDs and LEN comparisons. */
4580 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4581 scalar_stmt, vect_prologue);
4582 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4583 if (len)
4585 /* Count LEN - 1 ANDs and LEN comparisons. */
4586 unsigned int nstmts = len * 2 - 1;
4587 /* +1 for each bias that needs adding. */
4588 for (unsigned int i = 0; i < len; ++i)
4589 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4590 nstmts += 1;
4591 (void) add_stmt_cost (target_cost_data, nstmts,
4592 scalar_stmt, vect_prologue);
4594 if (dump_enabled_p ())
4595 dump_printf (MSG_NOTE,
4596 "cost model: Adding cost of checks for loop "
4597 "versioning aliasing.\n");
4600 /* Requires loop versioning with niter checks. */
4601 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4603 /* FIXME: Make cost depend on complexity of individual check. */
4604 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4605 NULL, NULL, NULL_TREE, 0, vect_prologue);
4606 if (dump_enabled_p ())
4607 dump_printf (MSG_NOTE,
4608 "cost model: Adding cost of checks for loop "
4609 "versioning niters.\n");
4612 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4613 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4614 vect_prologue);
4616 /* Count statements in scalar loop. Using this as scalar cost for a single
4617 iteration for now.
4619 TODO: Add outer loop support.
4621 TODO: Consider assigning different costs to different scalar
4622 statements. */
4624 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4626 /* Add additional cost for the peeled instructions in prologue and epilogue
4627 loop. (For fully-masked loops there will be no peeling.)
4629 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4630 at compile time, we assume it's vf/2 (the worst would be vf-1).
4632 TODO: Build an expression that represents peel_iters for prologue and
4633 epilogue to be used in a run-time test. */
4635 bool prologue_need_br_taken_cost = false;
4636 bool prologue_need_br_not_taken_cost = false;
4638 /* Calculate peel_iters_prologue. */
4639 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4640 peel_iters_prologue = 0;
4641 else if (npeel < 0)
4643 peel_iters_prologue = assumed_vf / 2;
4644 if (dump_enabled_p ())
4645 dump_printf (MSG_NOTE, "cost model: "
4646 "prologue peel iters set to vf/2.\n");
4648 /* If peeled iterations are unknown, count a taken branch and a not taken
4649 branch per peeled loop. Even if scalar loop iterations are known,
4650 vector iterations are not known since peeled prologue iterations are
4651 not known. Hence guards remain the same. */
4652 prologue_need_br_taken_cost = true;
4653 prologue_need_br_not_taken_cost = true;
4655 else
4657 peel_iters_prologue = npeel;
4658 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4659 /* If peeled iterations are known but the number of scalar loop
4660 iterations is unknown, count a taken branch per peeled loop. */
4661 prologue_need_br_taken_cost = true;
4664 bool epilogue_need_br_taken_cost = false;
4665 bool epilogue_need_br_not_taken_cost = false;
4667 /* Calculate peel_iters_epilogue. */
4668 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4669 /* We need to peel exactly one iteration for gaps. */
4670 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4671 else if (npeel < 0)
4673 /* If the amount of peeling for alignment is unknown, the loop bound
4674 of the main loop becomes unknown. */
4675 peel_iters_epilogue = assumed_vf / 2;
4676 if (dump_enabled_p ())
4677 dump_printf (MSG_NOTE, "cost model: "
4678 "epilogue peel iters set to vf/2 because "
4679 "peeling for alignment is unknown.\n");
4681 /* See the same reason above in peel_iters_prologue calculation. */
4682 epilogue_need_br_taken_cost = true;
4683 epilogue_need_br_not_taken_cost = true;
4685 else
4687 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4688 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4689 /* If peeled iterations are known but the number of scalar loop
4690 iterations is unknown, count a taken branch per peeled loop. */
4691 epilogue_need_br_taken_cost = true;
4694 stmt_info_for_cost *si;
4695 int j;
4696 /* Add costs associated with peel_iters_prologue. */
4697 if (peel_iters_prologue)
4698 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4700 (void) add_stmt_cost (target_cost_data,
4701 si->count * peel_iters_prologue, si->kind,
4702 si->stmt_info, si->node, si->vectype,
4703 si->misalign, vect_prologue);
4706 /* Add costs associated with peel_iters_epilogue. */
4707 if (peel_iters_epilogue)
4708 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4710 (void) add_stmt_cost (target_cost_data,
4711 si->count * peel_iters_epilogue, si->kind,
4712 si->stmt_info, si->node, si->vectype,
4713 si->misalign, vect_epilogue);
4716 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4718 if (prologue_need_br_taken_cost)
4719 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4720 vect_prologue);
4722 if (prologue_need_br_not_taken_cost)
4723 (void) add_stmt_cost (target_cost_data, 1,
4724 cond_branch_not_taken, vect_prologue);
4726 if (epilogue_need_br_taken_cost)
4727 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4728 vect_epilogue);
4730 if (epilogue_need_br_not_taken_cost)
4731 (void) add_stmt_cost (target_cost_data, 1,
4732 cond_branch_not_taken, vect_epilogue);
4734 /* Take care of special costs for rgroup controls of partial vectors. */
4735 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4736 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4737 == vect_partial_vectors_avx512))
4739 /* Calculate how many masks we need to generate. */
4740 unsigned int num_masks = 0;
4741 bool need_saturation = false;
4742 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4743 if (rgm.type)
4745 unsigned nvectors = rgm.factor;
4746 num_masks += nvectors;
4747 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4748 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4749 need_saturation = true;
4752 /* ??? The target isn't able to identify the costs below as
4753 producing masks so it cannot penalize cases where we'd run
4754 out of mask registers for example. */
4756 /* ??? We are also failing to account for smaller vector masks
4757 we generate by splitting larger masks in vect_get_loop_mask. */
4759 /* In the worst case, we need to generate each mask in the prologue
4760 and in the loop body. We need one splat per group and one
4761 compare per mask.
4763 Sometimes the prologue mask will fold to a constant,
4764 so the actual prologue cost might be smaller. However, it's
4765 simpler and safer to use the worst-case cost; if this ends up
4766 being the tie-breaker between vectorizing or not, then it's
4767 probably better not to vectorize. */
4768 (void) add_stmt_cost (target_cost_data,
4769 num_masks
4770 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4771 vector_stmt, NULL, NULL, NULL_TREE, 0,
4772 vect_prologue);
4773 (void) add_stmt_cost (target_cost_data,
4774 num_masks
4775 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4776 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4778 /* When we need saturation we need it both in the prologue and
4779 the epilogue. */
4780 if (need_saturation)
4782 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4783 NULL, NULL, NULL_TREE, 0, vect_prologue);
4784 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4785 NULL, NULL, NULL_TREE, 0, vect_body);
4788 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4789 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4790 == vect_partial_vectors_while_ult))
4792 /* Calculate how many masks we need to generate. */
4793 unsigned int num_masks = 0;
4794 rgroup_controls *rgm;
4795 unsigned int num_vectors_m1;
4796 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4797 num_vectors_m1, rgm)
4798 if (rgm->type)
4799 num_masks += num_vectors_m1 + 1;
4800 gcc_assert (num_masks > 0);
4802 /* In the worst case, we need to generate each mask in the prologue
4803 and in the loop body. One of the loop body mask instructions
4804 replaces the comparison in the scalar loop, and since we don't
4805 count the scalar comparison against the scalar body, we shouldn't
4806 count that vector instruction against the vector body either.
4808 Sometimes we can use unpacks instead of generating prologue
4809 masks and sometimes the prologue mask will fold to a constant,
4810 so the actual prologue cost might be smaller. However, it's
4811 simpler and safer to use the worst-case cost; if this ends up
4812 being the tie-breaker between vectorizing or not, then it's
4813 probably better not to vectorize. */
4814 (void) add_stmt_cost (target_cost_data, num_masks,
4815 vector_stmt, NULL, NULL, NULL_TREE, 0,
4816 vect_prologue);
4817 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4818 vector_stmt, NULL, NULL, NULL_TREE, 0,
4819 vect_body);
4821 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4823 /* Referring to the functions vect_set_loop_condition_partial_vectors
4824 and vect_set_loop_controls_directly, we need to generate each
4825 length in the prologue and in the loop body if required. Although
4826 there are some possible optimizations, we consider the worst case
4827 here. */
4829 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4830 signed char partial_load_store_bias
4831 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4832 bool need_iterate_p
4833 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4834 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4836 /* Calculate how many statements need to be added. */
4837 unsigned int prologue_stmts = 0;
4838 unsigned int body_stmts = 0;
4840 rgroup_controls *rgc;
4841 unsigned int num_vectors_m1;
4842 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4843 if (rgc->type)
4845 /* May need one SHIFT for nitems_total computation. */
4846 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4847 if (nitems != 1 && !niters_known_p)
4848 prologue_stmts += 1;
4850 /* May need one MAX and one MINUS for wrap around. */
4851 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4852 prologue_stmts += 2;
4854 /* Need one MAX and one MINUS for each batch limit except for
4855 the first one. */
4856 prologue_stmts += num_vectors_m1 * 2;
4858 unsigned int num_vectors = num_vectors_m1 + 1;
4860 /* Need to set up lengths in prologue, only one MIN required
4861 for each since start index is zero. */
4862 prologue_stmts += num_vectors;
4864 /* If we have a non-zero partial load bias, we need one PLUS
4865 to adjust the load length. */
4866 if (partial_load_store_bias != 0)
4867 body_stmts += 1;
4869 /* Each may need two MINs and one MINUS to update lengths in body
4870 for next iteration. */
4871 if (need_iterate_p)
4872 body_stmts += 3 * num_vectors;
4875 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4876 scalar_stmt, vect_prologue);
4877 (void) add_stmt_cost (target_cost_data, body_stmts,
4878 scalar_stmt, vect_body);
4881 /* FORNOW: The scalar outside cost is incremented in one of the
4882 following ways:
4884 1. The vectorizer checks for alignment and aliasing and generates
4885 a condition that allows dynamic vectorization. A cost model
4886 check is ANDed with the versioning condition. Hence the scalar code
4887 path now has the added cost of the versioning check.
4889 if (cost > th & versioning_check)
4890 jmp to vector code
4892 Hence the run-time scalar cost is incremented by a not-taken branch cost.
4894 2. The vectorizer then checks if a prologue is required. If the
4895 cost model check was not done before during versioning, it has to
4896 be done before the prologue check.
4898 if (cost <= th)
4899 prologue = scalar_iters
4900 if (prologue == 0)
4901 jmp to vector code
4902 else
4903 execute prologue
4904 if (prologue == num_iters)
4905 go to exit
4907 Hence the run-time scalar cost is incremented by a taken branch,
4908 plus a not-taken branch, plus a taken branch cost.
4910 3. The vectorizer then checks if an epilogue is required. If the
4911 cost model check was not done before during prologue check, it
4912 has to be done with the epilogue check.
4914 if (prologue == 0)
4915 jmp to vector code
4916 else
4917 execute prologue
4918 if (prologue == num_iters)
4919 go to exit
4920 vector code:
4921 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4922 jmp to epilogue
4924 Hence the run-time scalar cost should be incremented by 2 taken
4925 branches.
4927 TODO: The back end may reorder the BBs differently and reverse
4928 conditions/branch directions. Change the estimates below to
4929 something more reasonable. */
4931 /* If the number of iterations is known and we do not do versioning, we can
4932 decide whether to vectorize at compile time. Hence the scalar version
4933 does not carry cost model guard costs. */
4934 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4935 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4937 /* Cost model check occurs at versioning. */
4938 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4939 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4940 else
4942 /* Cost model check occurs at prologue generation. */
4943 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4944 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4945 + vect_get_stmt_cost (cond_branch_not_taken);
4946 /* Cost model check occurs at epilogue generation. */
4947 else
4948 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4952 /* Complete the target-specific cost calculations. */
4953 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4954 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4955 suggested_unroll_factor);
4957 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4958 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4959 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4960 *suggested_unroll_factor,
4961 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4963 if (dump_enabled_p ())
4964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4965 "can't unroll as unrolled vectorization factor larger"
4966 " than maximum vectorization factor: "
4967 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4968 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4969 *suggested_unroll_factor = 1;
4972 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4974 if (dump_enabled_p ())
4976 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4977 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4978 vec_inside_cost);
4979 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4980 vec_prologue_cost);
4981 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4982 vec_epilogue_cost);
4983 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4984 scalar_single_iter_cost);
4985 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4986 scalar_outside_cost);
4987 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4988 vec_outside_cost);
4989 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4990 peel_iters_prologue);
4991 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4992 peel_iters_epilogue);
4995 /* Calculate number of iterations required to make the vector version
4996 profitable, relative to the loop bodies only. The following condition
4997 must hold true:
4998 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4999 where
5000 SIC = scalar iteration cost, VIC = vector iteration cost,
5001 VOC = vector outside cost, VF = vectorization factor,
5002 NPEEL = prologue iterations + epilogue iterations,
5003 SOC = scalar outside cost for run time cost model check. */
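/* A hedged numeric illustration (costs invented for the example): with
   SIC = 4, VIC = 6, VF = 8, VOC = 50 and NPEEL = SOC = 0, the condition
   4 * niters > 6 * niters / 8 + 50 first holds at niters = 16, which is
   the kind of threshold computed below (here without partial vectors,
   peeling or versioning).  */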
5005 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5006 - vec_inside_cost);
5007 if (saving_per_viter <= 0)
5009 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5010 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5011 "vectorization did not happen for a simd loop");
5013 if (dump_enabled_p ())
5014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5015 "cost model: the vector iteration cost = %d "
5016 "divided by the scalar iteration cost = %d "
5017 "is greater or equal to the vectorization factor = %d"
5018 ".\n",
5019 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5020 *ret_min_profitable_niters = -1;
5021 *ret_min_profitable_estimate = -1;
5022 return;
5025 /* ??? The "if" arm is written to handle all cases; see below for what
5026 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5027 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5029 /* Rewriting the condition above in terms of the number of
5030 vector iterations (vniters) rather than the number of
5031 scalar iterations (niters) gives:
5033 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5035 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5037 For integer N, X and Y when X > 0:
5039 N * X > Y <==> N >= (Y /[floor] X) + 1. */
5040 int outside_overhead = (vec_outside_cost
5041 - scalar_single_iter_cost * peel_iters_prologue
5042 - scalar_single_iter_cost * peel_iters_epilogue
5043 - scalar_outside_cost);
5044 /* We're only interested in cases that require at least one
5045 vector iteration. */
5046 int min_vec_niters = 1;
5047 if (outside_overhead > 0)
5048 min_vec_niters = outside_overhead / saving_per_viter + 1;
5050 if (dump_enabled_p ())
5051 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5052 min_vec_niters);
5054 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5056 /* Now that we know the minimum number of vector iterations,
5057 find the minimum niters for which the scalar cost is larger:
5059 SIC * niters > VIC * vniters + VOC - SOC
5061 We know that the minimum niters is no more than
5062 vniters * VF + NPEEL, but it might be (and often is) less
5063 than that if a partial vector iteration is cheaper than the
5064 equivalent scalar code. */
5065 int threshold = (vec_inside_cost * min_vec_niters
5066 + vec_outside_cost
5067 - scalar_outside_cost);
5068 if (threshold <= 0)
5069 min_profitable_iters = 1;
5070 else
5071 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5073 else
5074 /* Convert the number of vector iterations into a number of
5075 scalar iterations. */
5076 min_profitable_iters = (min_vec_niters * assumed_vf
5077 + peel_iters_prologue
5078 + peel_iters_epilogue);
5080 else
5082 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5083 * assumed_vf
5084 - vec_inside_cost * peel_iters_prologue
5085 - vec_inside_cost * peel_iters_epilogue);
5086 if (min_profitable_iters <= 0)
5087 min_profitable_iters = 0;
5088 else
5090 min_profitable_iters /= saving_per_viter;
5092 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5093 <= (((int) vec_inside_cost * min_profitable_iters)
5094 + (((int) vec_outside_cost - scalar_outside_cost)
5095 * assumed_vf)))
5096 min_profitable_iters++;
5100 if (dump_enabled_p ())
5101 dump_printf (MSG_NOTE,
5102 " Calculated minimum iters for profitability: %d\n",
5103 min_profitable_iters);
5105 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5106 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5107 /* We want the vectorized loop to execute at least once. */
5108 min_profitable_iters = assumed_vf + peel_iters_prologue;
5109 else if (min_profitable_iters < peel_iters_prologue)
5110 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5111 vectorized loop executes at least once. */
5112 min_profitable_iters = peel_iters_prologue;
5114 if (dump_enabled_p ())
5115 dump_printf_loc (MSG_NOTE, vect_location,
5116 " Runtime profitability threshold = %d\n",
5117 min_profitable_iters);
5119 *ret_min_profitable_niters = min_profitable_iters;
5121 /* Calculate number of iterations required to make the vector version
5122 profitable, relative to the loop bodies only.
5124 The non-vectorized variant costs SIC * niters and must win over the vector
5125 variant for the expected loop trip count. The following condition must hold true:
5126 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5128 if (vec_outside_cost <= 0)
5129 min_profitable_estimate = 0;
5130 /* ??? This "else if" arm is written to handle all cases; see below for
5131 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5132 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5134 /* This is a repeat of the code above, but with + SOC rather
5135 than - SOC. */
5136 int outside_overhead = (vec_outside_cost
5137 - scalar_single_iter_cost * peel_iters_prologue
5138 - scalar_single_iter_cost * peel_iters_epilogue
5139 + scalar_outside_cost);
5140 int min_vec_niters = 1;
5141 if (outside_overhead > 0)
5142 min_vec_niters = outside_overhead / saving_per_viter + 1;
5144 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5146 int threshold = (vec_inside_cost * min_vec_niters
5147 + vec_outside_cost
5148 + scalar_outside_cost);
5149 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5151 else
5152 min_profitable_estimate = (min_vec_niters * assumed_vf
5153 + peel_iters_prologue
5154 + peel_iters_epilogue);
5156 else
5158 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5159 * assumed_vf
5160 - vec_inside_cost * peel_iters_prologue
5161 - vec_inside_cost * peel_iters_epilogue)
5162 / ((scalar_single_iter_cost * assumed_vf)
5163 - vec_inside_cost);
5165 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5166 if (dump_enabled_p ())
5167 dump_printf_loc (MSG_NOTE, vect_location,
5168 " Static estimate profitability threshold = %d\n",
5169 min_profitable_estimate);
5171 *ret_min_profitable_estimate = min_profitable_estimate;
5174 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5175 vector elements (not bits) for a vector with NELT elements. */
5176 static void
5177 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5178 vec_perm_builder *sel)
5180 /* The encoding is a single stepped pattern. Any wrap-around is handled
5181 by vec_perm_indices. */
5182 sel->new_vector (nelt, 1, 3);
5183 for (unsigned int i = 0; i < 3; i++)
5184 sel->quick_push (i + offset);
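/* Editorial example (not part of the original source): for OFFSET = 2 and
   NELT = 8 the three encoded elements are {2, 3, 4}, which vec_perm_indices
   extends to the stepped series {2, 3, 4, 5, 6, 7, 8, 9}.  Indices 8 and 9
   select from the second VEC_PERM_EXPR operand, so with a zero vector there
   (as in the reduction epilogue below) every element moves two positions
   towards element 0 and the top two elements become zero, matching the
   vec_shr behaviour described above.  */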
5187 /* Checks whether the target supports whole-vector shifts for vectors of mode
5188 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5189 it supports vec_perm_const with masks for all necessary shift amounts. */
5190 static bool
5191 have_whole_vector_shift (machine_mode mode)
5193 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5194 return true;
5196 /* Variable-length vectors should be handled via the optab. */
5197 unsigned int nelt;
5198 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5199 return false;
5201 vec_perm_builder sel;
5202 vec_perm_indices indices;
5203 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5205 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5206 indices.new_vector (sel, 2, nelt);
5207 if (!can_vec_perm_const_p (mode, mode, indices, false))
5208 return false;
5210 return true;
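/* Editorial note: for a fixed-width mode with NELT = 8 the loop above probes
   shift amounts 4, 2 and 1 (exactly the offsets the shift-based reduction
   epilogue will request) and rejects the whole-vector-shift strategy if any
   of those permutations is unsupported.  */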
5213 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5214 multiplication operands have differing signs and (b) we intend
5215 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5216 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5218 static bool
5219 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5220 stmt_vec_info stmt_info)
5222 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5223 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5224 return false;
5226 tree rhs1 = gimple_assign_rhs1 (assign);
5227 tree rhs2 = gimple_assign_rhs2 (assign);
5228 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5229 return false;
5231 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5232 gcc_assert (reduc_info->is_reduc_info);
5233 return !directly_supported_p (DOT_PROD_EXPR,
5234 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5235 optab_vector_mixed_sign);
5238 /* TODO: There is a close dependency between the vect_model_*_cost and
5239 vectorizable_* functions. Design this better to avoid maintenance issues. */
5241 /* Function vect_model_reduction_cost.
5243 Models cost for a reduction operation, including the vector ops
5244 generated within the strip-mine loop in some cases, the initial
5245 definition before the loop, and the epilogue code that must be generated. */
5247 static void
5248 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5249 stmt_vec_info stmt_info, internal_fn reduc_fn,
5250 vect_reduction_type reduction_type,
5251 int ncopies, stmt_vector_for_cost *cost_vec)
5253 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5254 tree vectype;
5255 machine_mode mode;
5256 class loop *loop = NULL;
5258 if (loop_vinfo)
5259 loop = LOOP_VINFO_LOOP (loop_vinfo);
5261 /* Condition reductions generate two reductions in the loop. */
5262 if (reduction_type == COND_REDUCTION)
5263 ncopies *= 2;
5265 vectype = STMT_VINFO_VECTYPE (stmt_info);
5266 mode = TYPE_MODE (vectype);
5267 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5269 gimple_match_op op;
5270 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5271 gcc_unreachable ();
5273 bool emulated_mixed_dot_prod
5274 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5275 if (reduction_type == EXTRACT_LAST_REDUCTION)
5276 /* No extra instructions are needed in the prologue. The loop body
5277 operations are costed in vectorizable_condition. */
5278 inside_cost = 0;
5279 else if (reduction_type == FOLD_LEFT_REDUCTION)
5281 /* No extra instructions needed in the prologue. */
5282 prologue_cost = 0;
5284 if (reduc_fn != IFN_LAST)
5285 /* Count one reduction-like operation per vector. */
5286 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5287 stmt_info, 0, vect_body);
5288 else
5290 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5291 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5292 inside_cost = record_stmt_cost (cost_vec, nelements,
5293 vec_to_scalar, stmt_info, 0,
5294 vect_body);
5295 inside_cost += record_stmt_cost (cost_vec, nelements,
5296 scalar_stmt, stmt_info, 0,
5297 vect_body);
5300 else
5302 /* Add in the cost of the initial definitions. */
5303 int prologue_stmts;
5304 if (reduction_type == COND_REDUCTION)
5305 /* For cond reductions we have four vectors: initial index, step,
5306 initial result of the data reduction, initial value of the index
5307 reduction. */
5308 prologue_stmts = 4;
5309 else if (emulated_mixed_dot_prod)
5310 /* We need the initial reduction value and two invariants:
5311 one that contains the minimum signed value and one that
5312 contains half of its negative. */
5313 prologue_stmts = 3;
5314 else
5315 prologue_stmts = 1;
5316 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5317 scalar_to_vec, stmt_info, 0,
5318 vect_prologue);
5321 /* Determine cost of epilogue code.
5323 We have a reduction operator that will reduce the vector in one statement.
5324 Also requires scalar extract. */
5326 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5328 if (reduc_fn != IFN_LAST)
5330 if (reduction_type == COND_REDUCTION)
5332 /* An EQ stmt and a COND_EXPR stmt. */
5333 epilogue_cost += record_stmt_cost (cost_vec, 2,
5334 vector_stmt, stmt_info, 0,
5335 vect_epilogue);
5336 /* Reduction of the max index and a reduction of the found
5337 values. */
5338 epilogue_cost += record_stmt_cost (cost_vec, 2,
5339 vec_to_scalar, stmt_info, 0,
5340 vect_epilogue);
5341 /* A broadcast of the max value. */
5342 epilogue_cost += record_stmt_cost (cost_vec, 1,
5343 scalar_to_vec, stmt_info, 0,
5344 vect_epilogue);
5346 else
5348 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5349 stmt_info, 0, vect_epilogue);
5350 epilogue_cost += record_stmt_cost (cost_vec, 1,
5351 vec_to_scalar, stmt_info, 0,
5352 vect_epilogue);
5355 else if (reduction_type == COND_REDUCTION)
5357 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5358 /* Extraction of scalar elements. */
5359 epilogue_cost += record_stmt_cost (cost_vec,
5360 2 * estimated_nunits,
5361 vec_to_scalar, stmt_info, 0,
5362 vect_epilogue);
5363 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5364 epilogue_cost += record_stmt_cost (cost_vec,
5365 2 * estimated_nunits - 3,
5366 scalar_stmt, stmt_info, 0,
5367 vect_epilogue);
5369 else if (reduction_type == EXTRACT_LAST_REDUCTION
5370 || reduction_type == FOLD_LEFT_REDUCTION)
5371 /* No extra instructions needed in the epilogue. */
5373 else
5375 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5376 tree bitsize = TYPE_SIZE (op.type);
5377 int element_bitsize = tree_to_uhwi (bitsize);
5378 int nelements = vec_size_in_bits / element_bitsize;
5380 if (op.code == COND_EXPR)
5381 op.code = MAX_EXPR;
5383 /* We have a whole vector shift available. */
5384 if (VECTOR_MODE_P (mode)
5385 && directly_supported_p (op.code, vectype)
5386 && have_whole_vector_shift (mode))
5388 /* Final reduction via vector shifts and the reduction operator.
5389 Also requires scalar extract. */
5390 epilogue_cost += record_stmt_cost (cost_vec,
5391 exact_log2 (nelements) * 2,
5392 vector_stmt, stmt_info, 0,
5393 vect_epilogue);
5394 epilogue_cost += record_stmt_cost (cost_vec, 1,
5395 vec_to_scalar, stmt_info, 0,
5396 vect_epilogue);
5398 else
5399 /* Use extracts and reduction op for final reduction. For N
5400 elements, we have N extracts and N-1 reduction ops. */
5401 epilogue_cost += record_stmt_cost (cost_vec,
5402 nelements + nelements - 1,
5403 vector_stmt, stmt_info, 0,
5404 vect_epilogue);
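   /* Editorial example with hypothetical numbers: for an 8-element vector
      the shift-based scheme above costs exact_log2 (8) * 2 = 6 vector
      statements plus one extract, whereas this extract-based fallback costs
      8 + 8 - 1 = 15 statements.  */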
5408 if (dump_enabled_p ())
5409 dump_printf (MSG_NOTE,
5410 "vect_model_reduction_cost: inside_cost = %d, "
5411 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5412 prologue_cost, epilogue_cost);
5415 /* SEQ is a sequence of instructions that initialize the reduction
5416 described by REDUC_INFO. Emit them in the appropriate place. */
5418 static void
5419 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5420 stmt_vec_info reduc_info, gimple *seq)
5422 if (reduc_info->reused_accumulator)
5424 /* When reusing an accumulator from the main loop, we only need
5425 initialization instructions if the main loop can be skipped.
5426 In that case, emit the initialization instructions at the end
5427 of the guard block that does the skip. */
5428 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5429 gcc_assert (skip_edge);
5430 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5431 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5433 else
5435 /* The normal case: emit the initialization instructions on the
5436 preheader edge. */
5437 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5438 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5442 /* Function get_initial_def_for_reduction
5444 Input:
5445 REDUC_INFO - the info_for_reduction
5446 INIT_VAL - the initial value of the reduction variable
5447 NEUTRAL_OP - a value that has no effect on the reduction, as per
5448 neutral_op_for_reduction
5450 Output:
5451 Return a vector variable, initialized according to the operation that
5452 STMT_VINFO performs. This vector will be used as the initial value
5453 of the vector of partial results.
5455 The value we need is a vector in which element 0 has value INIT_VAL
5456 and every other element has value NEUTRAL_OP. */
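/* Editorial example: for a sum reduction with INIT_VAL 5 and NEUTRAL_OP 0 on
   a four-element integer vector the routine below builds {5, 0, 0, 0}; when
   INIT_VAL equals NEUTRAL_OP (e.g. a MIN/MAX reduction seeded with the
   initial value itself) the vector degenerates to a simple splat.  */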
5458 static tree
5459 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5460 stmt_vec_info reduc_info,
5461 tree init_val, tree neutral_op)
5463 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5464 tree scalar_type = TREE_TYPE (init_val);
5465 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5466 tree init_def;
5467 gimple_seq stmts = NULL;
5469 gcc_assert (vectype);
5471 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5472 || SCALAR_FLOAT_TYPE_P (scalar_type));
5474 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5475 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5477 if (operand_equal_p (init_val, neutral_op))
5479 /* If both elements are equal then the vector described above is
5480 just a splat. */
5481 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5482 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5484 else
5486 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5487 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5488 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5490 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5491 element 0. */
5492 init_def = gimple_build_vector_from_val (&stmts, vectype,
5493 neutral_op);
5494 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5495 vectype, init_def, init_val);
5497 else
5499 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5500 tree_vector_builder elts (vectype, 1, 2);
5501 elts.quick_push (init_val);
5502 elts.quick_push (neutral_op);
5503 init_def = gimple_build_vector (&stmts, &elts);
5507 if (stmts)
5508 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5509 return init_def;
5512 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5513 which performs a reduction involving GROUP_SIZE scalar statements.
5514 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5515 is nonnull, introducing extra elements of that value will not change the
5516 result. */
5518 static void
5519 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5520 stmt_vec_info reduc_info,
5521 vec<tree> *vec_oprnds,
5522 unsigned int number_of_vectors,
5523 unsigned int group_size, tree neutral_op)
5525 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5526 unsigned HOST_WIDE_INT nunits;
5527 unsigned j, number_of_places_left_in_vector;
5528 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5529 unsigned int i;
5531 gcc_assert (group_size == initial_values.length () || neutral_op);
5533 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5534 created vectors. It is greater than 1 if unrolling is performed.
5536 For example, we have two scalar operands, s1 and s2 (e.g., group of
5537 strided accesses of size two), while NUNITS is four (i.e., four scalars
5538 of this type can be packed in a vector). The output vector will contain
5539 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5540 will be 2).
5542 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5543 vectors containing the operands.
5545 For example, NUNITS is four as before, and the group size is 8
5546 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5547 {s5, s6, s7, s8}. */
5549 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5550 nunits = group_size;
5552 number_of_places_left_in_vector = nunits;
5553 bool constant_p = true;
5554 tree_vector_builder elts (vector_type, nunits, 1);
5555 elts.quick_grow (nunits);
5556 gimple_seq ctor_seq = NULL;
5557 for (j = 0; j < nunits * number_of_vectors; ++j)
5559 tree op;
5560 i = j % group_size;
5562 /* Get the def before the loop. In a reduction chain we have only
5563 one initial value; otherwise we have as many as there are PHIs in the group. */
5564 if (i >= initial_values.length () || (j > i && neutral_op))
5565 op = neutral_op;
5566 else
5567 op = initial_values[i];
5569 /* Create 'vect_ = {op0,op1,...,opn}'. */
5570 number_of_places_left_in_vector--;
5571 elts[nunits - number_of_places_left_in_vector - 1] = op;
5572 if (!CONSTANT_CLASS_P (op))
5573 constant_p = false;
5575 if (number_of_places_left_in_vector == 0)
5577 tree init;
5578 if (constant_p && !neutral_op
5579 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5580 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5581 /* Build the vector directly from ELTS. */
5582 init = gimple_build_vector (&ctor_seq, &elts);
5583 else if (neutral_op)
5585 /* Build a vector of the neutral value and shift the
5586 other elements into place. */
5587 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5588 neutral_op);
5589 int k = nunits;
5590 while (k > 0 && elts[k - 1] == neutral_op)
5591 k -= 1;
5592 while (k > 0)
5594 k -= 1;
5595 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5596 vector_type, init, elts[k]);
5599 else
5601 /* First time round, duplicate ELTS to fill the
5602 required number of vectors. */
5603 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5604 elts, number_of_vectors, *vec_oprnds);
5605 break;
5607 vec_oprnds->quick_push (init);
5609 number_of_places_left_in_vector = nunits;
5610 elts.new_vector (vector_type, nunits, 1);
5611 elts.quick_grow (nunits);
5612 constant_p = true;
5615 if (ctor_seq != NULL)
5616 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5619 /* For a statement STMT_INFO taking part in a reduction operation return
5620 the stmt_vec_info the meta information is stored on. */
5622 stmt_vec_info
5623 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5625 stmt_info = vect_orig_stmt (stmt_info);
5626 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5627 if (!is_a <gphi *> (stmt_info->stmt)
5628 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5629 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5630 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5631 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5633 if (gimple_phi_num_args (phi) == 1)
5634 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5636 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5638 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5639 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5640 stmt_info = info;
5642 return stmt_info;
5645 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5646 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5647 return false. */
5649 static bool
5650 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5651 stmt_vec_info reduc_info)
5653 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5654 if (!main_loop_vinfo)
5655 return false;
5657 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5658 return false;
5660 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5661 auto_vec<tree, 16> main_loop_results (num_phis);
5662 auto_vec<tree, 16> initial_values (num_phis);
5663 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5665 /* The epilogue loop can be entered either from the main loop or
5666 from an earlier guard block. */
5667 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5668 for (tree incoming_value : reduc_info->reduc_initial_values)
5670 /* Look for:
5672 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5673 INITIAL_VALUE(guard block)>. */
5674 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5676 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5677 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5679 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5680 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5682 main_loop_results.quick_push (from_main_loop);
5683 initial_values.quick_push (from_skip);
5686 else
5687 /* The main loop dominates the epilogue loop. */
5688 main_loop_results.splice (reduc_info->reduc_initial_values);
5690 /* See if the main loop has the kind of accumulator we need. */
5691 vect_reusable_accumulator *accumulator
5692 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5693 if (!accumulator
5694 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5695 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5696 accumulator->reduc_info->reduc_scalar_results.begin ()))
5697 return false;
5699 /* Handle the case where we can reduce wider vectors to narrower ones. */
5700 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5701 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5702 unsigned HOST_WIDE_INT m;
5703 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5704 TYPE_VECTOR_SUBPARTS (vectype), &m))
5705 return false;
5706 /* Check the intermediate vector types and operations are available. */
5707 tree prev_vectype = old_vectype;
5708 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5709 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5711 intermediate_nunits = exact_div (intermediate_nunits, 2);
5712 tree intermediate_vectype = get_related_vectype_for_scalar_type
5713 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5714 if (!intermediate_vectype
5715 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5716 intermediate_vectype)
5717 || !can_vec_extract (TYPE_MODE (prev_vectype),
5718 TYPE_MODE (intermediate_vectype)))
5719 return false;
5720 prev_vectype = intermediate_vectype;
5723 /* Non-SLP reductions might apply an adjustment after the reduction
5724 operation, in order to simplify the initialization of the accumulator.
5725 If the epilogue loop carries on from where the main loop left off,
5726 it should apply the same adjustment to the final reduction result.
5728 If the epilogue loop can also be entered directly (rather than via
5729 the main loop), we need to be able to handle that case in the same way,
5730 with the same adjustment. (In principle we could add a PHI node
5731 to select the correct adjustment, but in practice that shouldn't be
5732 necessary.) */
5733 tree main_adjustment
5734 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5735 if (loop_vinfo->main_loop_edge && main_adjustment)
5737 gcc_assert (num_phis == 1);
5738 tree initial_value = initial_values[0];
5739 /* Check that we can use INITIAL_VALUE as the adjustment and
5740 initialize the accumulator with a neutral value instead. */
5741 if (!operand_equal_p (initial_value, main_adjustment))
5742 return false;
5743 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5744 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5745 code, initial_value);
5747 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5748 reduc_info->reduc_initial_values.truncate (0);
5749 reduc_info->reduc_initial_values.splice (initial_values);
5750 reduc_info->reused_accumulator = accumulator;
5751 return true;
5754 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5755 CODE, appending the generated stmts to SEQ. Returns a vector def of VECTYPE. */
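/* Editorial example: reducing a V8SI partial result down to V4SI with
   PLUS_EXPR takes one halving step below: the low and high V4SI halves are
   extracted (directly via vec_extract, or by punning through a two-element
   integer vector) and added, and the resulting V4SI value is returned for
   the final reduction.  */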
5757 static tree
5758 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5759 gimple_seq *seq)
5761 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5762 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5763 tree stype = TREE_TYPE (vectype);
5764 tree new_temp = vec_def;
5765 while (nunits > nunits1)
5767 nunits /= 2;
5768 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5769 stype, nunits);
5770 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5772 /* The target has to make sure we support lowpart/highpart
5773 extraction, either via direct vector extract or through
5774 integer mode punning. */
5775 tree dst1, dst2;
5776 gimple *epilog_stmt;
5777 if (convert_optab_handler (vec_extract_optab,
5778 TYPE_MODE (TREE_TYPE (new_temp)),
5779 TYPE_MODE (vectype1))
5780 != CODE_FOR_nothing)
5782 /* Extract sub-vectors directly once vec_extract becomes
5783 a conversion optab. */
5784 dst1 = make_ssa_name (vectype1);
5785 epilog_stmt
5786 = gimple_build_assign (dst1, BIT_FIELD_REF,
5787 build3 (BIT_FIELD_REF, vectype1,
5788 new_temp, TYPE_SIZE (vectype1),
5789 bitsize_int (0)));
5790 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5791 dst2 = make_ssa_name (vectype1);
5792 epilog_stmt
5793 = gimple_build_assign (dst2, BIT_FIELD_REF,
5794 build3 (BIT_FIELD_REF, vectype1,
5795 new_temp, TYPE_SIZE (vectype1),
5796 bitsize_int (bitsize)));
5797 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5799 else
5801 /* Extract via punning to appropriately sized integer mode
5802 vector. */
5803 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5804 tree etype = build_vector_type (eltype, 2);
5805 gcc_assert (convert_optab_handler (vec_extract_optab,
5806 TYPE_MODE (etype),
5807 TYPE_MODE (eltype))
5808 != CODE_FOR_nothing);
5809 tree tem = make_ssa_name (etype);
5810 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5811 build1 (VIEW_CONVERT_EXPR,
5812 etype, new_temp));
5813 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5814 new_temp = tem;
5815 tem = make_ssa_name (eltype);
5816 epilog_stmt
5817 = gimple_build_assign (tem, BIT_FIELD_REF,
5818 build3 (BIT_FIELD_REF, eltype,
5819 new_temp, TYPE_SIZE (eltype),
5820 bitsize_int (0)));
5821 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5822 dst1 = make_ssa_name (vectype1);
5823 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5824 build1 (VIEW_CONVERT_EXPR,
5825 vectype1, tem));
5826 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5827 tem = make_ssa_name (eltype);
5828 epilog_stmt
5829 = gimple_build_assign (tem, BIT_FIELD_REF,
5830 build3 (BIT_FIELD_REF, eltype,
5831 new_temp, TYPE_SIZE (eltype),
5832 bitsize_int (bitsize)));
5833 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5834 dst2 = make_ssa_name (vectype1);
5835 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5836 build1 (VIEW_CONVERT_EXPR,
5837 vectype1, tem));
5838 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5841 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5844 return new_temp;
5847 /* Function vect_create_epilog_for_reduction
5849 Create code at the loop-epilog to finalize the result of a reduction
5850 computation.
5852 STMT_INFO is the scalar reduction stmt that is being vectorized.
5853 SLP_NODE is an SLP node containing a group of reduction statements. The
5854 first one in this group is STMT_INFO.
5855 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5856 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5857 (counting from 0)
5859 This function:
5860 1. Completes the reduction def-use cycles.
5861 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5862 by calling the function specified by REDUC_FN if available, or by
5863 other means (whole-vector shifts or a scalar loop).
5864 The function also creates a new phi node at the loop exit to preserve
5865 loop-closed form, as illustrated below.
5867 The flow at the entry to this function:
5869 loop:
5870 vec_def = phi <vec_init, null> # REDUCTION_PHI
5871 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5872 s_loop = scalar_stmt # (scalar) STMT_INFO
5873 loop_exit:
5874 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5875 use <s_out0>
5876 use <s_out0>
5878 The above is transformed by this function into:
5880 loop:
5881 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5882 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5883 s_loop = scalar_stmt # (scalar) STMT_INFO
5884 loop_exit:
5885 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5886 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5887 v_out2 = reduce <v_out1>
5888 s_out3 = extract_field <v_out2, 0>
5889 s_out4 = adjust_result <s_out3>
5890 use <s_out4>
5891 use <s_out4>
5894 static void
5895 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5896 stmt_vec_info stmt_info,
5897 slp_tree slp_node,
5898 slp_instance slp_node_instance)
5900 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5901 gcc_assert (reduc_info->is_reduc_info);
5902 /* For double reductions we need to get at the inner loop reduction
5903 stmt which has the meta info attached. Our stmt_info is that of the
5904 loop-closed PHI of the inner loop which we remember as
5905 def for the reduction PHI generation. */
5906 bool double_reduc = false;
5907 stmt_vec_info rdef_info = stmt_info;
5908 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5910 gcc_assert (!slp_node);
5911 double_reduc = true;
5912 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5913 (stmt_info->stmt, 0));
5914 stmt_info = vect_stmt_to_vectorize (stmt_info);
5916 gphi *reduc_def_stmt
5917 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5918 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5919 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5920 tree vectype;
5921 machine_mode mode;
5922 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5923 basic_block exit_bb;
5924 tree scalar_dest;
5925 tree scalar_type;
5926 gimple *new_phi = NULL, *phi = NULL;
5927 gimple_stmt_iterator exit_gsi;
5928 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5929 gimple *epilog_stmt = NULL;
5930 gimple *exit_phi;
5931 tree bitsize;
5932 tree def;
5933 tree orig_name, scalar_result;
5934 imm_use_iterator imm_iter, phi_imm_iter;
5935 use_operand_p use_p, phi_use_p;
5936 gimple *use_stmt;
5937 auto_vec<tree> reduc_inputs;
5938 int j, i;
5939 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5940 unsigned int group_size = 1, k;
5941 auto_vec<gimple *> phis;
5942 /* SLP reduction without reduction chain, e.g.,
5943 # a1 = phi <a2, a0>
5944 # b1 = phi <b2, b0>
5945 a2 = operation (a1)
5946 b2 = operation (b1) */
5947 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5948 bool direct_slp_reduc;
5949 tree induction_index = NULL_TREE;
5951 if (slp_node)
5952 group_size = SLP_TREE_LANES (slp_node);
5954 if (nested_in_vect_loop_p (loop, stmt_info))
5956 outer_loop = loop;
5957 loop = loop->inner;
5958 gcc_assert (!slp_node && double_reduc);
5961 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5962 gcc_assert (vectype);
5963 mode = TYPE_MODE (vectype);
5965 tree induc_val = NULL_TREE;
5966 tree adjustment_def = NULL;
5967 if (slp_node)
5969 else
5971 /* Optimize: for induction condition reduction, if we can't use zero
5972 for induc_val, use initial_def. */
5973 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5974 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5975 else if (double_reduc)
5977 else
5978 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5981 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5982 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5983 if (slp_reduc)
5984 /* All statements produce live-out values. */
5985 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5986 else if (slp_node)
5988 /* The last statement in the reduction chain produces the live-out
5989 value. Note that SLP optimization can shuffle scalar stmts to
5990 optimize permutations, so we have to search for the last stmt.
5991 for (k = 0; k < group_size; ++k)
5992 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5994 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5995 break;
5999 unsigned vec_num;
6000 int ncopies;
6001 if (slp_node)
6003 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6004 ncopies = 1;
6006 else
6008 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6009 vec_num = 1;
6010 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6013 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6014 which is updated with the current index of the loop for every match of
6015 the original loop's cond_expr (VEC_STMT). This results in a vector
6016 containing the last time the condition passed for that vector lane.
6017 The first match will be a 1 to allow 0 to be used for non-matching
6018 indexes. If there are no matches at all then the vector will be all
6019 zeroes.
6021 PR92772: This algorithm is broken for architectures that support
6022 masked vectors, but do not provide fold_extract_last. */
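  /* Editorial example: with VF = 4 the index IV starts at {1, 2, 3, 4} and
     steps by 4 per vector iteration, so a lane whose condition last matched
     in the second vector iteration records a value in {5, 6, 7, 8}, while a
     lane that never matched keeps 0 (which is why the series starts at 1
     rather than 0).  */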
6023 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6025 auto_vec<std::pair<tree, bool>, 2> ccompares;
6026 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6027 cond_info = vect_stmt_to_vectorize (cond_info);
6028 while (cond_info != reduc_info)
6030 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6032 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6033 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6034 ccompares.safe_push
6035 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6036 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6038 cond_info
6039 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6040 1 + STMT_VINFO_REDUC_IDX
6041 (cond_info)));
6042 cond_info = vect_stmt_to_vectorize (cond_info);
6044 gcc_assert (ccompares.length () != 0);
6046 tree indx_before_incr, indx_after_incr;
6047 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6048 int scalar_precision
6049 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6050 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6051 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6052 (TYPE_MODE (vectype), cr_index_scalar_type,
6053 TYPE_VECTOR_SUBPARTS (vectype));
6055 /* First we create a simple vector induction variable which starts
6056 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6057 vector size (STEP). */
6059 /* Create a {1,2,3,...} vector. */
6060 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6062 /* Create a vector of the step value. */
6063 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6064 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6066 /* Create an induction variable. */
6067 gimple_stmt_iterator incr_gsi;
6068 bool insert_after;
6069 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6070 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6071 insert_after, &indx_before_incr, &indx_after_incr);
6073 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6074 filled with zeros (VEC_ZERO). */
6076 /* Create a vector of 0s. */
6077 tree zero = build_zero_cst (cr_index_scalar_type);
6078 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6080 /* Create a vector phi node. */
6081 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6082 new_phi = create_phi_node (new_phi_tree, loop->header);
6083 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6084 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6086 /* Now take the condition from the loop's original cond_exprs
6087 and produce a new cond_expr (INDEX_COND_EXPR) which for
6088 every match uses values from the induction variable
6089 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6090 (NEW_PHI_TREE).
6091 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6092 the new cond_expr (INDEX_COND_EXPR). */
6093 gimple_seq stmts = NULL;
6094 for (int i = ccompares.length () - 1; i != -1; --i)
6096 tree ccompare = ccompares[i].first;
6097 if (ccompares[i].second)
6098 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6099 cr_index_vector_type,
6100 ccompare,
6101 indx_before_incr, new_phi_tree);
6102 else
6103 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6104 cr_index_vector_type,
6105 ccompare,
6106 new_phi_tree, indx_before_incr);
6108 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6110 /* Update the phi with the vec cond. */
6111 induction_index = new_phi_tree;
6112 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6113 loop_latch_edge (loop), UNKNOWN_LOCATION);
6116 /* 2. Create epilog code.
6117 The reduction epilog code operates across the elements of the vector
6118 of partial results computed by the vectorized loop.
6119 The reduction epilog code consists of:
6121 step 1: compute the scalar result in a vector (v_out2)
6122 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6123 step 3: adjust the scalar result (s_out3) if needed.
6125 Step 1 can be accomplished using one of the following three schemes:
6126 (scheme 1) using reduc_fn, if available.
6127 (scheme 2) using whole-vector shifts, if available.
6128 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6129 combined.
6131 The overall epilog code looks like this:
6133 s_out0 = phi <s_loop> # original EXIT_PHI
6134 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6135 v_out2 = reduce <v_out1> # step 1
6136 s_out3 = extract_field <v_out2, 0> # step 2
6137 s_out4 = adjust_result <s_out3> # step 3
6139 (step 3 is optional, and steps 1 and 2 may be combined).
6140 Lastly, the uses of s_out0 are replaced by s_out4. */
6143 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6144 v_out1 = phi <VECT_DEF>
6145 Store them in NEW_PHIS. */
6146 if (double_reduc)
6147 loop = outer_loop;
6148 exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
6149 exit_gsi = gsi_after_labels (exit_bb);
6150 reduc_inputs.create (slp_node ? vec_num : ncopies);
6151 for (unsigned i = 0; i < vec_num; i++)
6153 gimple_seq stmts = NULL;
6154 if (slp_node)
6155 def = vect_get_slp_vect_def (slp_node, i);
6156 else
6157 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6158 for (j = 0; j < ncopies; j++)
6160 tree new_def = copy_ssa_name (def);
6161 phi = create_phi_node (new_def, exit_bb);
6162 if (j)
6163 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6164 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
6165 new_def = gimple_convert (&stmts, vectype, new_def);
6166 reduc_inputs.quick_push (new_def);
6168 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6171 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6172 (i.e. when reduc_fn is not available) and in the final adjustment
6173 code (if needed). Also get the original scalar reduction variable as
6174 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6175 represents a reduction pattern), the tree-code and scalar-def are
6176 taken from the original stmt that the pattern-stmt (STMT) replaces.
6177 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6178 are taken from STMT. */
6180 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6181 if (orig_stmt_info != stmt_info)
6183 /* Reduction pattern */
6184 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6185 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6188 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6189 scalar_type = TREE_TYPE (scalar_dest);
6190 scalar_results.truncate (0);
6191 scalar_results.reserve_exact (group_size);
6192 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6193 bitsize = TYPE_SIZE (scalar_type);
6195 /* True if we should implement SLP_REDUC using native reduction operations
6196 instead of scalar operations. */
6197 direct_slp_reduc = (reduc_fn != IFN_LAST
6198 && slp_reduc
6199 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6201 /* In case of reduction chain, e.g.,
6202 # a1 = phi <a3, a0>
6203 a2 = operation (a1)
6204 a3 = operation (a2),
6206 we may end up with more than one vector result. Here we reduce them
6207 to one vector.
6209 The same is true for a SLP reduction, e.g.,
6210 # a1 = phi <a2, a0>
6211 # b1 = phi <b2, b0>
6212 a2 = operation (a1)
6213 b2 = operation (b1),
6215 where we can end up with more than one vector as well. We can
6216 easily accumulate vectors when the number of vector elements is
6217 a multiple of the SLP group size.
6219 The same is true if we couldn't use a single defuse cycle. */
6220 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6221 || direct_slp_reduc
6222 || (slp_reduc
6223 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6224 || ncopies > 1)
6226 gimple_seq stmts = NULL;
6227 tree single_input = reduc_inputs[0];
6228 for (k = 1; k < reduc_inputs.length (); k++)
6229 single_input = gimple_build (&stmts, code, vectype,
6230 single_input, reduc_inputs[k]);
6231 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6233 reduc_inputs.truncate (0);
6234 reduc_inputs.safe_push (single_input);
6237 tree orig_reduc_input = reduc_inputs[0];
6239 /* If this loop is an epilogue loop that can be skipped after the
6240 main loop, we can only share a reduction operation between the
6241 main loop and the epilogue if we put it at the target of the
6242 skip edge.
6244 We can still reuse accumulators if this check fails. Doing so has
6245 the minor(?) benefit of making the epilogue loop's scalar result
6246 independent of the main loop's scalar result. */
6247 bool unify_with_main_loop_p = false;
6248 if (reduc_info->reused_accumulator
6249 && loop_vinfo->skip_this_loop_edge
6250 && single_succ_p (exit_bb)
6251 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6253 unify_with_main_loop_p = true;
6255 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6256 reduc_inputs[0] = make_ssa_name (vectype);
6257 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6258 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6259 UNKNOWN_LOCATION);
6260 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6261 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6262 exit_gsi = gsi_after_labels (reduc_block);
6265 /* Shouldn't be used beyond this point. */
6266 exit_bb = nullptr;
6268 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6269 && reduc_fn != IFN_LAST)
6271 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6272 various data values where the condition matched and another vector
6273 (INDUCTION_INDEX) containing all the indexes of those matches. We
6274 need to extract the last matching index (which will be the index with
6275 highest value) and use this to index into the data vector.
6276 For the case where there were no matches, the data vector will contain
6277 all default values and the index vector will be all zeros. */
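   /* Editorial example with placeholder lane values: if INDUCTION_INDEX is
      {0, 6, 0, 4} and REDUC_INPUTS[0] is {d0, d1, d2, d3}, the max index is
      6, the comparison selects lane 1 only, the VEC_COND keeps d1 and zeroes
      the other lanes, and the final unsigned MAX reduction extracts d1.  */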
6279 /* Get various versions of the type of the vector of indexes. */
6280 tree index_vec_type = TREE_TYPE (induction_index);
6281 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6282 tree index_scalar_type = TREE_TYPE (index_vec_type);
6283 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6285 /* Get an unsigned integer version of the type of the data vector. */
6286 int scalar_precision
6287 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6288 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6289 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6290 vectype);
6292 /* First we need to create a vector (ZERO_VEC) of zeros and another
6293 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6294 can create using a MAX reduction and then expanding.
6295 In the case where the loop never made any matches, the max index will
6296 be zero. */
6298 /* Vector of {0, 0, 0,...}. */
6299 tree zero_vec = build_zero_cst (vectype);
6301 /* Find maximum value from the vector of found indexes. */
6302 tree max_index = make_ssa_name (index_scalar_type);
6303 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6304 1, induction_index);
6305 gimple_call_set_lhs (max_index_stmt, max_index);
6306 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6308 /* Vector of {max_index, max_index, max_index,...}. */
6309 tree max_index_vec = make_ssa_name (index_vec_type);
6310 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6311 max_index);
6312 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6313 max_index_vec_rhs);
6314 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6316 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6317 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6318 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6319 otherwise. Only one value should match, resulting in a vector
6320 (VEC_COND) with one data value and the rest zeros.
6321 In the case where the loop never made any matches, every index will
6322 match, resulting in a vector with all data values (which will all be
6323 the default value). */
6325 /* Compare the max index vector to the vector of found indexes to find
6326 the position of the max value. */
6327 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6328 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6329 induction_index,
6330 max_index_vec);
6331 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6333 /* Use the compare to choose either values from the data vector or
6334 zero. */
6335 tree vec_cond = make_ssa_name (vectype);
6336 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6337 vec_compare,
6338 reduc_inputs[0],
6339 zero_vec);
6340 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6342 /* Finally we need to extract the data value from the vector (VEC_COND)
6343 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6344 reduction, but because this doesn't exist, we can use a MAX reduction
6345 instead. The data value might be signed or a float so we need to cast
6346 it first.
6347 In the case where the loop never made any matches, the data values are
6348 all identical, and so will reduce down correctly. */
6350 /* Make the matched data values unsigned. */
6351 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6352 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6353 vec_cond);
6354 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6355 VIEW_CONVERT_EXPR,
6356 vec_cond_cast_rhs);
6357 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6359 /* Reduce down to a scalar value. */
6360 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6361 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6362 1, vec_cond_cast);
6363 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6364 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6366 /* Convert the reduced value back to the result type and set as the
6367 result. */
6368 gimple_seq stmts = NULL;
6369 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6370 data_reduc);
6371 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6372 scalar_results.safe_push (new_temp);
6374 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6375 && reduc_fn == IFN_LAST)
6377 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6378 idx = 0;
6379 idx_val = induction_index[0];
6380 val = data_reduc[0];
6381 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6382 if (induction_index[i] > idx_val)
6383 val = data_reduc[i], idx_val = induction_index[i];
6384 return val; */
6386 tree data_eltype = TREE_TYPE (vectype);
6387 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6388 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6389 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6390 /* Enforced by vectorizable_reduction, which ensures we have target
6391 support before allowing a conditional reduction on variable-length
6392 vectors. */
6393 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6394 tree idx_val = NULL_TREE, val = NULL_TREE;
6395 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6397 tree old_idx_val = idx_val;
6398 tree old_val = val;
6399 idx_val = make_ssa_name (idx_eltype);
6400 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6401 build3 (BIT_FIELD_REF, idx_eltype,
6402 induction_index,
6403 bitsize_int (el_size),
6404 bitsize_int (off)));
6405 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6406 val = make_ssa_name (data_eltype);
6407 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6408 build3 (BIT_FIELD_REF,
6409 data_eltype,
6410 reduc_inputs[0],
6411 bitsize_int (el_size),
6412 bitsize_int (off)));
6413 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6414 if (off != 0)
6416 tree new_idx_val = idx_val;
6417 if (off != v_size - el_size)
6419 new_idx_val = make_ssa_name (idx_eltype);
6420 epilog_stmt = gimple_build_assign (new_idx_val,
6421 MAX_EXPR, idx_val,
6422 old_idx_val);
6423 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6425 tree cond = make_ssa_name (boolean_type_node);
6426 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6427 idx_val, old_idx_val);
6428 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6429 tree new_val = make_ssa_name (data_eltype);
6430 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6431 cond, val, old_val);
6432 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6433 idx_val = new_idx_val;
6434 val = new_val;
6437 /* Convert the reduced value back to the result type and set as the
6438 result. */
6439 gimple_seq stmts = NULL;
6440 val = gimple_convert (&stmts, scalar_type, val);
6441 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6442 scalar_results.safe_push (val);
6445 /* 2.3 Create the reduction code, using one of the three schemes described
6446 above. In SLP we simply need to extract all the elements from the
6447 vector (without reducing them), so we use scalar shifts. */
6448 else if (reduc_fn != IFN_LAST && !slp_reduc)
6450 tree tmp;
6451 tree vec_elem_type;
6453 /* Case 1: Create:
6454 v_out2 = reduc_expr <v_out1> */
6456 if (dump_enabled_p ())
6457 dump_printf_loc (MSG_NOTE, vect_location,
6458 "Reduce using direct vector reduction.\n");
6460 gimple_seq stmts = NULL;
6461 vec_elem_type = TREE_TYPE (vectype);
6462 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6463 vec_elem_type, reduc_inputs[0]);
6464 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6465 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6467 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6468 && induc_val)
6470 /* Earlier we set the initial value to be a vector of induc_val
6471 values. Check the result and if it is induc_val then replace
6472 with the original initial value, unless induc_val is
6473 the same as initial_def already. */
6474 tree zcompare = make_ssa_name (boolean_type_node);
6475 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6476 new_temp, induc_val);
6477 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6478 tree initial_def = reduc_info->reduc_initial_values[0];
6479 tmp = make_ssa_name (new_scalar_dest);
6480 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6481 initial_def, new_temp);
6482 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6483 new_temp = tmp;
6486 scalar_results.safe_push (new_temp);
6488 else if (direct_slp_reduc)
6490 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6491 with the elements for other SLP statements replaced with the
6492 neutral value. We can then do a normal reduction on each vector. */
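   /* Editorial example: for GROUP_SIZE = 2 and a partial-result vector
      {a0, b0, a1, b1, ...}, the loop below builds {a0, id, a1, id, ...}
      for the first reduction and {id, b0, id, b1, ...} for the second,
      where "id" is the neutral value, and then reduces each of those
      vectors with REDUC_FN.  */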
6494 /* Enforced by vectorizable_reduction. */
6495 gcc_assert (reduc_inputs.length () == 1);
6496 gcc_assert (pow2p_hwi (group_size));
6498 gimple_seq seq = NULL;
6500 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6501 and the same element size as VECTYPE. */
6502 tree index = build_index_vector (vectype, 0, 1);
6503 tree index_type = TREE_TYPE (index);
6504 tree index_elt_type = TREE_TYPE (index_type);
6505 tree mask_type = truth_type_for (index_type);
6507 /* Create a vector that, for each element, identifies which of
6508 the REDUC_GROUP_SIZE results should use it. */
6509 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6510 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6511 build_vector_from_val (index_type, index_mask));
6513 /* Get a neutral vector value. This is simply a splat of the neutral
6514 scalar value if we have one, otherwise the initial scalar value
6515 is itself a neutral value. */
6516 tree vector_identity = NULL_TREE;
6517 tree neutral_op = NULL_TREE;
6518 if (slp_node)
6520 tree initial_value = NULL_TREE;
6521 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6522 initial_value = reduc_info->reduc_initial_values[0];
6523 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6524 initial_value, false);
6526 if (neutral_op)
6527 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6528 neutral_op);
6529 for (unsigned int i = 0; i < group_size; ++i)
6531 /* If there's no universal neutral value, we can use the
6532 initial scalar value from the original PHI. This is used
6533 for MIN and MAX reduction, for example. */
6534 if (!neutral_op)
6536 tree scalar_value = reduc_info->reduc_initial_values[i];
6537 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6538 scalar_value);
6539 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6540 scalar_value);
6543 /* Calculate the equivalent of:
6545 sel[j] = (index[j] == i);
6547 which selects the elements of REDUC_INPUTS[0] that should
6548 be included in the result. */
6549 tree compare_val = build_int_cst (index_elt_type, i);
6550 compare_val = build_vector_from_val (index_type, compare_val);
6551 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6552 index, compare_val);
6554 /* Calculate the equivalent of:
6556 vec = sel ? reduc_inputs[0] : vector_identity;
6558 VEC is now suitable for a full vector reduction. */
6559 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6560 sel, reduc_inputs[0], vector_identity);
6562 /* Do the reduction and convert it to the appropriate type. */
6563 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6564 TREE_TYPE (vectype), vec);
6565 scalar = gimple_convert (&seq, scalar_type, scalar);
6566 scalar_results.safe_push (scalar);
6568 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6570 else
6572 bool reduce_with_shift;
6573 tree vec_temp;
6575 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6577 /* See if the target wants to do the final (shift) reduction
6578 in a vector mode of smaller size and first reduce upper/lower
6579 halves against each other. */
6580 enum machine_mode mode1 = mode;
6581 tree stype = TREE_TYPE (vectype);
6582 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6583 unsigned nunits1 = nunits;
6584 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6585 && reduc_inputs.length () == 1)
6587 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6588 /* For SLP reductions we have to make sure lanes match up, but
6589 since we're doing an individual-element final reduction, reducing
6590 the vector width here is even more important.
6591 ??? We can also separate lanes with permutes; for the common
6592 case of a power-of-two group size, odd/even extracts would work. */
6593 if (slp_reduc && nunits != nunits1)
6595 nunits1 = least_common_multiple (nunits1, group_size);
6596 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6599 if (!slp_reduc
6600 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6601 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6603 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6604 stype, nunits1);
6605 reduce_with_shift = have_whole_vector_shift (mode1);
6606 if (!VECTOR_MODE_P (mode1)
6607 || !directly_supported_p (code, vectype1))
6608 reduce_with_shift = false;
6610 /* First reduce the vector to the desired vector size we should
6611 do shift reduction on by combining upper and lower halves. */
6612 gimple_seq stmts = NULL;
6613 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6614 code, &stmts);
6615 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6616 reduc_inputs[0] = new_temp;
6618 if (reduce_with_shift && !slp_reduc)
6620 int element_bitsize = tree_to_uhwi (bitsize);
6621 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6622 for variable-length vectors and also requires direct target support
6623 for loop reductions. */
6624 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6625 int nelements = vec_size_in_bits / element_bitsize;
6626 vec_perm_builder sel;
6627 vec_perm_indices indices;
6629 int elt_offset;
6631 tree zero_vec = build_zero_cst (vectype1);
6632 /* Case 2: Create:
6633 for (offset = nelements/2; offset >= 1; offset/=2)
6635 Create: va' = vec_shift <va, offset>
6636 Create: va = vop <va, va'>
6637 } */
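/* A hypothetical walk-through for a 4-element PLUS reduction (values
   written left to right, "_" marking lanes that are never extracted):
     va                = { a0, a1, a2, a3 }
     offset == 2:  va' = { a2, a3, 0, 0 }
                   va  = { a0+a2, a1+a3, _, _ }
     offset == 1:  va' = { a1+a3, _, _, 0 }
                   va  = { a0+a2+a1+a3, _, _, _ }
   so the reduced value ends up in element 0 of VA.  */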
6639 tree rhs;
6641 if (dump_enabled_p ())
6642 dump_printf_loc (MSG_NOTE, vect_location,
6643 "Reduce using vector shifts\n");
6645 gimple_seq stmts = NULL;
6646 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6647 for (elt_offset = nelements / 2;
6648 elt_offset >= 1;
6649 elt_offset /= 2)
6651 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6652 indices.new_vector (sel, 2, nelements);
6653 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6654 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6655 new_temp, zero_vec, mask);
6656 new_temp = gimple_build (&stmts, code,
6657 vectype1, new_name, new_temp);
6659 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6661 /* 2.4 Extract the final scalar result. Create:
6662 s_out3 = extract_field <v_out2, bitpos> */
6664 if (dump_enabled_p ())
6665 dump_printf_loc (MSG_NOTE, vect_location,
6666 "extract scalar result\n");
6668 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6669 bitsize, bitsize_zero_node);
6670 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6671 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6672 gimple_assign_set_lhs (epilog_stmt, new_temp);
6673 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6674 scalar_results.safe_push (new_temp);
6676 else
6678 /* Case 3: Create:
6679 s = extract_field <v_out2, 0>
6680 for (offset = element_size;
6681 offset < vector_size;
6682 offset += element_size;)
6684 Create: s' = extract_field <v_out2, offset>
6685 Create: s = op <s, s'> // For non SLP cases
6686 } */
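/* For instance (purely illustrative), reducing a 4-element vector
   { a0, a1, a2, a3 } with PLUS yields
     s = a0;  s = s + a1;  s = s + a2;  s = s + a3;
   i.e. one BIT_FIELD_REF per element and nelements - 1 scalar operations;
   for SLP the extracted values are collected without combining.  */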
6688 if (dump_enabled_p ())
6689 dump_printf_loc (MSG_NOTE, vect_location,
6690 "Reduce using scalar code.\n");
6692 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6693 int element_bitsize = tree_to_uhwi (bitsize);
6694 tree compute_type = TREE_TYPE (vectype);
6695 gimple_seq stmts = NULL;
6696 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6698 int bit_offset;
6699 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6700 vec_temp, bitsize, bitsize_zero_node);
6702 /* In SLP we don't need to apply reduction operation, so we just
6703 collect s' values in SCALAR_RESULTS. */
6704 if (slp_reduc)
6705 scalar_results.safe_push (new_temp);
6707 for (bit_offset = element_bitsize;
6708 bit_offset < vec_size_in_bits;
6709 bit_offset += element_bitsize)
6711 tree bitpos = bitsize_int (bit_offset);
6712 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6713 compute_type, vec_temp,
6714 bitsize, bitpos);
6715 if (slp_reduc)
6717 /* In SLP we don't need to apply reduction operation, so
6718 we just collect s' values in SCALAR_RESULTS. */
6719 new_temp = new_name;
6720 scalar_results.safe_push (new_name);
6722 else
6723 new_temp = gimple_build (&stmts, code, compute_type,
6724 new_name, new_temp);
6728 /* The only case where we need to reduce scalar results in SLP is
6729    unrolling. If the size of SCALAR_RESULTS is greater than
6730    REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6731    REDUC_GROUP_SIZE. */
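/* E.g. (illustrative): with REDUC_GROUP_SIZE == 2 and the SLP node
   unrolled twice, SCALAR_RESULTS is { s0, s1, s2, s3 } and the loop
   below folds it to { s0 op s2, s1 op s3 } before truncating it to
   the group size.  */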
6732 if (slp_reduc)
6734 tree res, first_res, new_res;
6736 /* Reduce multiple scalar results in case of SLP unrolling. */
6737 for (j = group_size; scalar_results.iterate (j, &res);
6738 j++)
6740 first_res = scalar_results[j % group_size];
6741 new_res = gimple_build (&stmts, code, compute_type,
6742 first_res, res);
6743 scalar_results[j % group_size] = new_res;
6745 scalar_results.truncate (group_size);
6746 for (k = 0; k < group_size; k++)
6747 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6748 scalar_results[k]);
6750 else
6752 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6753 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6754 scalar_results.safe_push (new_temp);
6757 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6760 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6761 && induc_val)
6763 /* Earlier we set the initial value to be a vector of induc_val
6764    values. Check the result and if it is induc_val then replace
6765    it with the original initial value, unless induc_val is
6766    the same as initial_def already. */
6767 tree zcompare = make_ssa_name (boolean_type_node);
6768 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6769 induc_val);
6770 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6771 tree initial_def = reduc_info->reduc_initial_values[0];
6772 tree tmp = make_ssa_name (new_scalar_dest);
6773 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6774 initial_def, new_temp);
6775 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6776 scalar_results[0] = tmp;
6780 /* 2.5 Adjust the final result by the initial value of the reduction
6781 variable. (When such adjustment is not needed, then
6782 'adjustment_def' is zero). For example, if code is PLUS we create:
6783 new_temp = loop_exit_def + adjustment_def */
6785 if (adjustment_def)
6787 gcc_assert (!slp_reduc);
6788 gimple_seq stmts = NULL;
6789 if (double_reduc)
6791 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6792 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6793 new_temp = gimple_build (&stmts, code, vectype,
6794 reduc_inputs[0], adjustment_def);
6796 else
6798 new_temp = scalar_results[0];
6799 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6800 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6801 adjustment_def);
6802 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6803 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6804 new_temp, adjustment_def);
6805 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6808 epilog_stmt = gimple_seq_last_stmt (stmts);
6809 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6810 scalar_results[0] = new_temp;
6813 /* Record this operation if it could be reused by the epilogue loop. */
6814 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6815 && reduc_inputs.length () == 1)
6816 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6817 { orig_reduc_input, reduc_info });
6819 if (double_reduc)
6820 loop = outer_loop;
6822 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6823 phis with new adjusted scalar results, i.e., replace use <s_out0>
6824 with use <s_out4>.
6826 Transform:
6827 loop_exit:
6828 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6829 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6830 v_out2 = reduce <v_out1>
6831 s_out3 = extract_field <v_out2, 0>
6832 s_out4 = adjust_result <s_out3>
6833 use <s_out0>
6834 use <s_out0>
6836 into:
6838 loop_exit:
6839 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6840 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6841 v_out2 = reduce <v_out1>
6842 s_out3 = extract_field <v_out2, 0>
6843 s_out4 = adjust_result <s_out3>
6844 use <s_out4>
6845 use <s_out4> */
6847 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6848 for (k = 0; k < live_out_stmts.size (); k++)
6850 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6851 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6853 phis.create (3);
6854 /* Find the loop-closed-use at the loop exit of the original scalar
6855 result. (The reduction result is expected to have two immediate uses,
6856 one at the latch block, and one at the loop exit). For double
6857 reductions we are looking for exit phis of the outer loop. */
6858 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6860 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6862 if (!is_gimple_debug (USE_STMT (use_p)))
6863 phis.safe_push (USE_STMT (use_p));
6865 else
6867 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6869 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6871 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6873 if (!flow_bb_inside_loop_p (loop,
6874 gimple_bb (USE_STMT (phi_use_p)))
6875 && !is_gimple_debug (USE_STMT (phi_use_p)))
6876 phis.safe_push (USE_STMT (phi_use_p));
6882 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6884 /* Replace the uses: */
6885 orig_name = PHI_RESULT (exit_phi);
6887 /* Look for a single use at the target of the skip edge. */
6888 if (unify_with_main_loop_p)
6890 use_operand_p use_p;
6891 gimple *user;
6892 if (!single_imm_use (orig_name, &use_p, &user))
6893 gcc_unreachable ();
6894 orig_name = gimple_get_lhs (user);
6897 scalar_result = scalar_results[k];
6898 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6900 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6901 SET_USE (use_p, scalar_result);
6902 update_stmt (use_stmt);
6906 phis.release ();
6910 /* Return a vector of type VECTYPE that is equal to the vector select
6911 operation "MASK ? VEC : IDENTITY". Insert the select statements
6912 before GSI. */
6914 static tree
6915 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6916 tree vec, tree identity)
6918 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6919 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6920 mask, vec, identity);
6921 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6922 return cond;
6925 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6926 order, starting with LHS. Insert the extraction statements before GSI and
6927 associate the new scalar SSA names with variable SCALAR_DEST.
6928 If MASK is nonzero, mask the input and then operate on it unconditionally.
6929 Return the SSA name for the result. */
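/* For example (illustrative only), for a 4-element VECTOR_RHS v and
   PLUS_EXPR code this expands to the strictly ordered sequence

     t0 = LHS + v[0];  t1 = t0 + v[1];  t2 = t1 + v[2];  t3 = t2 + v[3];

   and t3 is returned, preserving the scalar (in-order) association.  */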
6931 static tree
6932 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6933 tree_code code, tree lhs, tree vector_rhs,
6934 tree mask)
6936 tree vectype = TREE_TYPE (vector_rhs);
6937 tree scalar_type = TREE_TYPE (vectype);
6938 tree bitsize = TYPE_SIZE (scalar_type);
6939 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6940 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6942 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6943 to perform an unconditional element-wise reduction of it. */
6944 if (mask)
6946 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6947 "masked_vector_rhs");
6948 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6949 false);
6950 tree vector_identity = build_vector_from_val (vectype, neutral_op);
6951 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6952 mask, vector_rhs, vector_identity);
6953 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6954 vector_rhs = masked_vector_rhs;
6957 for (unsigned HOST_WIDE_INT bit_offset = 0;
6958 bit_offset < vec_size_in_bits;
6959 bit_offset += element_bitsize)
6961 tree bitpos = bitsize_int (bit_offset);
6962 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6963 bitsize, bitpos);
6965 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6966 rhs = make_ssa_name (scalar_dest, stmt);
6967 gimple_assign_set_lhs (stmt, rhs);
6968 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6970 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6971 tree new_name = make_ssa_name (scalar_dest, stmt);
6972 gimple_assign_set_lhs (stmt, new_name);
6973 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6974 lhs = new_name;
6976 return lhs;
6979 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6980 type of the vector input. */
6982 static internal_fn
6983 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6985 internal_fn mask_reduc_fn;
6986 internal_fn mask_len_reduc_fn;
6988 switch (reduc_fn)
6990 case IFN_FOLD_LEFT_PLUS:
6991 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6992 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6993 break;
6995 default:
6996 return IFN_LAST;
6999 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7000 OPTIMIZE_FOR_SPEED))
7001 return mask_reduc_fn;
7002 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7003 OPTIMIZE_FOR_SPEED))
7004 return mask_len_reduc_fn;
7005 return IFN_LAST;
7008 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7009 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7010 statement. CODE is the operation performed by STMT_INFO and OPS are
7011 its scalar operands. REDUC_INDEX is the index of the operand in
7012 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7013 implements in-order reduction, or IFN_LAST if we should open-code it.
7014 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7015 that should be used to control the operation in a fully-masked loop. */
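/* As an informal sketch (not the exact IL we emit), an in-order float
   accumulation

     float res = init;
     for (int i = 0; i < n; ++i)
       res += a[i];

   keeps a single scalar accumulator and issues one

     res = FOLD_LEFT_PLUS (res, vec_a);

   per vector, which adds the elements of VEC_A to RES starting from the
   lowest-numbered lane, so the rounding behaviour of the original scalar
   evaluation order is preserved.  */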
7017 static bool
7018 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7019 stmt_vec_info stmt_info,
7020 gimple_stmt_iterator *gsi,
7021 gimple **vec_stmt, slp_tree slp_node,
7022 gimple *reduc_def_stmt,
7023 code_helper code, internal_fn reduc_fn,
7024 tree *ops, int num_ops, tree vectype_in,
7025 int reduc_index, vec_loop_masks *masks,
7026 vec_loop_lens *lens)
7028 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7029 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7030 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7032 int ncopies;
7033 if (slp_node)
7034 ncopies = 1;
7035 else
7036 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7038 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7039 gcc_assert (ncopies == 1);
7041 bool is_cond_op = false;
7042 if (!code.is_tree_code ())
7044 code = conditional_internal_fn_code (internal_fn (code));
7045 gcc_assert (code != ERROR_MARK);
7046 is_cond_op = true;
7049 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7051 if (slp_node)
7053 if (is_cond_op)
7055 if (dump_enabled_p ())
7056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7057 "fold-left reduction on SLP not supported.\n");
7058 return false;
7061 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7062 TYPE_VECTOR_SUBPARTS (vectype_in)));
7065 /* The operands either come from a binary operation or an IFN_COND operation.
7066 The former is a gimple assign with binary rhs and the latter is a
7067 gimple call with four arguments. */
7068 gcc_assert (num_ops == 2 || num_ops == 4);
7069 tree op0, opmask;
7070 if (!is_cond_op)
7071 op0 = ops[1 - reduc_index];
7072 else
7074 op0 = ops[2 + (1 - reduc_index)];
7075 opmask = ops[0];
7076 gcc_assert (!slp_node);
7079 int group_size = 1;
7080 stmt_vec_info scalar_dest_def_info;
7081 auto_vec<tree> vec_oprnds0, vec_opmask;
7082 if (slp_node)
7084 auto_vec<vec<tree> > vec_defs (2);
7085 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7086 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7087 vec_defs[0].release ();
7088 vec_defs[1].release ();
7089 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7090 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7092 else
7094 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7095 op0, &vec_oprnds0);
7096 scalar_dest_def_info = stmt_info;
7098 /* For an IFN_COND_OP we also need the vector mask operand. */
7099 if (is_cond_op)
7100 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7101 opmask, &vec_opmask);
7104 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7105 tree scalar_dest = gimple_get_lhs (sdef);
7106 tree scalar_type = TREE_TYPE (scalar_dest);
7107 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7109 int vec_num = vec_oprnds0.length ();
7110 gcc_assert (vec_num == 1 || slp_node);
7111 tree vec_elem_type = TREE_TYPE (vectype_out);
7112 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7114 tree vector_identity = NULL_TREE;
7115 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7117 vector_identity = build_zero_cst (vectype_out);
7118 if (!HONOR_SIGNED_ZEROS (vectype_out))
7120 else
7122 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7123 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7124 vector_identity);
7128 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7129 int i;
7130 tree def0;
7131 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7133 gimple *new_stmt;
7134 tree mask = NULL_TREE;
7135 tree len = NULL_TREE;
7136 tree bias = NULL_TREE;
7137 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7138 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7139 else if (is_cond_op)
7140 mask = vec_opmask[0];
7141 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7143 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7144 i, 1);
7145 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7146 bias = build_int_cst (intQI_type_node, biasval);
7147 if (!is_cond_op)
7148 mask = build_minus_one_cst (truth_type_for (vectype_in));
7151 /* Handle MINUS by adding the negative. */
7152 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7154 tree negated = make_ssa_name (vectype_out);
7155 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7156 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7157 def0 = negated;
7160 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7161 && mask && mask_reduc_fn == IFN_LAST)
7162 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7163 vector_identity);
7165 /* On the first iteration the input is simply the scalar phi
7166 result, and for subsequent iterations it is the output of
7167 the preceding operation. */
7168 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7170 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7171 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7172 def0, mask, len, bias);
7173 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7174 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7175 def0, mask);
7176 else
7177 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7178 def0);
7179 /* For chained SLP reductions the output of the previous reduction
7180 operation serves as the input of the next. For the final statement
7181 the output cannot be a temporary - we reuse the original
7182 scalar destination of the last statement. */
7183 if (i != vec_num - 1)
7185 gimple_set_lhs (new_stmt, scalar_dest_var);
7186 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7187 gimple_set_lhs (new_stmt, reduc_var);
7190 else
7192 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7193 tree_code (code), reduc_var, def0,
7194 mask);
7195 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7196 /* Remove the statement, so that we can use the same code paths
7197 as for statements that we've just created. */
7198 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7199 gsi_remove (&tmp_gsi, true);
7202 if (i == vec_num - 1)
7204 gimple_set_lhs (new_stmt, scalar_dest);
7205 vect_finish_replace_stmt (loop_vinfo,
7206 scalar_dest_def_info,
7207 new_stmt);
7209 else
7210 vect_finish_stmt_generation (loop_vinfo,
7211 scalar_dest_def_info,
7212 new_stmt, gsi);
7214 if (slp_node)
7215 slp_node->push_vec_def (new_stmt);
7216 else
7218 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7219 *vec_stmt = new_stmt;
7223 return true;
7226 /* Function is_nonwrapping_integer_induction.
7228 Check if STMT_VINFO (which is part of loop LOOP) both increments and
7229 does not cause overflow. */
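/* A worked example with invented numbers: for an unsigned short IV with
   base 10 and step 3 in a loop executing at most 1000 times, the largest
   value reached is 10 + 3 * 1000 = 3010, which needs only 12 bits and so
   fits the 16-bit type; the induction therefore cannot wrap.  */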
7231 static bool
7232 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7234 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7235 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7236 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7237 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7238 widest_int ni, max_loop_value, lhs_max;
7239 wi::overflow_type overflow = wi::OVF_NONE;
7241 /* Make sure the loop is integer based. */
7242 if (TREE_CODE (base) != INTEGER_CST
7243 || TREE_CODE (step) != INTEGER_CST)
7244 return false;
7246 /* Check that the max size of the loop will not wrap. */
7248 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7249 return true;
7251 if (! max_stmt_executions (loop, &ni))
7252 return false;
7254 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7255 &overflow);
7256 if (overflow)
7257 return false;
7259 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7260 TYPE_SIGN (lhs_type), &overflow);
7261 if (overflow)
7262 return false;
7264 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7265 <= TYPE_PRECISION (lhs_type));
7268 /* Check if masking can be supported by inserting a conditional expression.
7269 CODE is the code for the operation. COND_FN is the conditional internal
7270 function, if it exists. VECTYPE_IN is the type of the vector input. */
7271 static bool
7272 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7273 tree vectype_in)
7275 if (cond_fn != IFN_LAST
7276 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7277 OPTIMIZE_FOR_SPEED))
7278 return false;
7280 if (code.is_tree_code ())
7281 switch (tree_code (code))
7283 case DOT_PROD_EXPR:
7284 case SAD_EXPR:
7285 return true;
7287 default:
7288 break;
7290 return false;
7293 /* Insert a conditional expression to enable masked vectorization. CODE is the
7294 code for the operation. VOP is the array of operands. MASK is the loop
7295 mask. GSI is a statement iterator used to place the new conditional
7296 expression. */
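/* For example (informal): DOT_PROD_EXPR <a, b, acc> accumulates a[i] * b[i],
   so rewriting B as (MASK ? B : 0) makes inactive lanes contribute
   a[i] * 0 == 0; SAD_EXPR <a, b, acc> accumulates |a[i] - b[i]|, so there
   the inactive lanes instead use (MASK ? B : A), contributing
   |a[i] - a[i]| == 0.  */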
7297 static void
7298 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7299 gimple_stmt_iterator *gsi)
7301 switch (tree_code (code))
7303 case DOT_PROD_EXPR:
7305 tree vectype = TREE_TYPE (vop[1]);
7306 tree zero = build_zero_cst (vectype);
7307 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7308 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7309 mask, vop[1], zero);
7310 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7311 vop[1] = masked_op1;
7312 break;
7315 case SAD_EXPR:
7317 tree vectype = TREE_TYPE (vop[1]);
7318 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7319 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7320 mask, vop[1], vop[0]);
7321 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7322 vop[1] = masked_op1;
7323 break;
7326 default:
7327 gcc_unreachable ();
7331 /* Function vectorizable_reduction.
7333 Check if STMT_INFO performs a reduction operation that can be vectorized.
7334 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7335 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7336 Return true if STMT_INFO is vectorizable in this way.
7338 This function also handles reduction idioms (patterns) that have been
7339 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7340 may be of this form:
7341 X = pattern_expr (arg0, arg1, ..., X)
7342 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7343 sequence that had been detected and replaced by the pattern-stmt
7344 (STMT_INFO).
7346 This function also handles reduction of condition expressions, for example:
7347 for (int i = 0; i < N; i++)
7348 if (a[i] < value)
7349 last = a[i];
7350 This is handled by vectorising the loop and creating an additional vector
7351 containing the loop indexes for which "a[i] < value" was true. In the
7352 function epilogue this is reduced to a single max value and then used to
7353 index into the vector of results.
7355 In some cases of reduction patterns, the type of the reduction variable X is
7356 different than the type of the other arguments of STMT_INFO.
7357 In such cases, the vectype that is used when transforming STMT_INFO into
7358 a vector stmt is different than the vectype that is used to determine the
7359 vectorization factor, because it consists of a different number of elements
7360 than the actual number of elements that are being operated upon in parallel.
7362 For example, consider an accumulation of shorts into an int accumulator.
7363 On some targets it's possible to vectorize this pattern operating on 8
7364 shorts at a time (hence, the vectype for purposes of determining the
7365 vectorization factor should be V8HI); on the other hand, the vectype that
7366 is used to create the vector form is actually V4SI (the type of the result).
7368 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7369 indicates what is the actual level of parallelism (V8HI in the example), so
7370 that the right vectorization factor would be derived. This vectype
7371 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7372 be used to create the vectorized stmt. The right vectype for the vectorized
7373 stmt is obtained from the type of the result X:
7374 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7376 This means that, contrary to "regular" reductions (or "regular" stmts in
7377 general), the following equation:
7378 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7379 does *NOT* necessarily hold for reduction patterns. */
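/* An informal sketch of the condition reduction described above (not the
   exact IL we generate): inside the loop we maintain

     vec_data = VEC_COND (vec_a < vec_value, vec_a, vec_data);
     vec_idx  = VEC_COND (vec_a < vec_value, vec_i + 1, vec_idx);

   and in the epilogue

     max_idx = REDUC_MAX (vec_idx);

   selects the lane of VEC_DATA whose entry in VEC_IDX equals MAX_IDX,
   with index zero reserved for "no match" so that the initial value is
   used when the condition never held.  */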
7381 bool
7382 vectorizable_reduction (loop_vec_info loop_vinfo,
7383 stmt_vec_info stmt_info, slp_tree slp_node,
7384 slp_instance slp_node_instance,
7385 stmt_vector_for_cost *cost_vec)
7387 tree vectype_in = NULL_TREE;
7388 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7389 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7390 stmt_vec_info cond_stmt_vinfo = NULL;
7391 int i;
7392 int ncopies;
7393 bool single_defuse_cycle = false;
7394 bool nested_cycle = false;
7395 bool double_reduc = false;
7396 int vec_num;
7397 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7398 tree cond_reduc_val = NULL_TREE;
7400 /* Make sure it was already recognized as a reduction computation. */
7401 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7402 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7403 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7404 return false;
7406 /* The stmt we store reduction analysis meta on. */
7407 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7408 reduc_info->is_reduc_info = true;
7410 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7412 if (is_a <gphi *> (stmt_info->stmt))
7414 if (slp_node)
7416 /* We eventually need to set a vector type on invariant
7417 arguments. */
7418 unsigned j;
7419 slp_tree child;
7420 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7421 if (!vect_maybe_update_slp_op_vectype
7422 (child, SLP_TREE_VECTYPE (slp_node)))
7424 if (dump_enabled_p ())
7425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7426 "incompatible vector types for "
7427 "invariants\n");
7428 return false;
7431 /* Analysis for double-reduction is done on the outer
7432 loop PHI, nested cycles have no further restrictions. */
7433 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7435 else
7436 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7437 return true;
7440 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7441 stmt_vec_info phi_info = stmt_info;
7442 if (!is_a <gphi *> (stmt_info->stmt))
7444 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7445 return true;
7447 if (slp_node)
7449 slp_node_instance->reduc_phis = slp_node;
7450 /* ??? We're leaving slp_node to point to the PHIs, we only
7451 need it to get at the number of vector stmts which wasn't
7452 yet initialized for the instance root. */
7454 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7456 use_operand_p use_p;
7457 gimple *use_stmt;
7458 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7459 &use_p, &use_stmt);
7460 gcc_assert (res);
7461 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7464 /* PHIs should not participate in patterns. */
7465 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7466 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7468 /* Verify that following REDUC_IDX from the latch def leads us back to the
7469    PHI and compute the reduction chain length. Discover the real
7470    reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7471 tree reduc_def
7472 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7473 loop_latch_edge
7474 (gimple_bb (reduc_def_phi)->loop_father));
7475 unsigned reduc_chain_length = 0;
7476 bool only_slp_reduc_chain = true;
7477 stmt_info = NULL;
7478 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7479 while (reduc_def != PHI_RESULT (reduc_def_phi))
7481 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7482 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7483 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7485 if (dump_enabled_p ())
7486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7487 "reduction chain broken by patterns.\n");
7488 return false;
7490 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7491 only_slp_reduc_chain = false;
7492 /* For epilogue generation live members of the chain need
7493 to point back to the PHI via their original stmt for
7494 info_for_reduction to work. For SLP we need to look at
7495 all lanes here - even though we only will vectorize from
7496 the SLP node with live lane zero the other live lanes also
7497 need to be identified as part of a reduction to be able
7498 to skip code generation for them. */
7499 if (slp_for_stmt_info)
7501 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7502 if (STMT_VINFO_LIVE_P (s))
7503 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7505 else if (STMT_VINFO_LIVE_P (vdef))
7506 STMT_VINFO_REDUC_DEF (def) = phi_info;
7507 gimple_match_op op;
7508 if (!gimple_extract_op (vdef->stmt, &op))
7510 if (dump_enabled_p ())
7511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7512 "reduction chain includes unsupported"
7513 " statement type.\n");
7514 return false;
7516 if (CONVERT_EXPR_CODE_P (op.code))
7518 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7520 if (dump_enabled_p ())
7521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7522 "conversion in the reduction chain.\n");
7523 return false;
7526 else if (!stmt_info)
7527 /* First non-conversion stmt. */
7528 stmt_info = vdef;
7529 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7530 reduc_chain_length++;
7531 if (!stmt_info && slp_node)
7532 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7534 /* PHIs should not participate in patterns. */
7535 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7537 if (nested_in_vect_loop_p (loop, stmt_info))
7539 loop = loop->inner;
7540 nested_cycle = true;
7543 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7544 element. */
7545 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7547 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7548 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7550 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7551 gcc_assert (slp_node
7552 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7554 /* 1. Is vectorizable reduction? */
7555 /* Not supportable if the reduction variable is used in the loop, unless
7556 it's a reduction chain. */
7557 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7558 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7559 return false;
7561 /* Reductions that are not used even in an enclosing outer-loop,
7562 are expected to be "live" (used out of the loop). */
7563 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7564 && !STMT_VINFO_LIVE_P (stmt_info))
7565 return false;
7567 /* 2. Has this been recognized as a reduction pattern?
7569 Check if STMT represents a pattern that has been recognized
7570 in earlier analysis stages. For stmts that represent a pattern,
7571 the STMT_VINFO_RELATED_STMT field records the last stmt in
7572 the original sequence that constitutes the pattern. */
7574 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7575 if (orig_stmt_info)
7577 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7578 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7581 /* 3. Check the operands of the operation. The first operands are defined
7582 inside the loop body. The last operand is the reduction variable,
7583 which is defined by the loop-header-phi. */
7585 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7586 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7587 gimple_match_op op;
7588 if (!gimple_extract_op (stmt_info->stmt, &op))
7589 gcc_unreachable ();
7590 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7591 || op.code == WIDEN_SUM_EXPR
7592 || op.code == SAD_EXPR);
7594 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7595 && !SCALAR_FLOAT_TYPE_P (op.type))
7596 return false;
7598 /* Do not try to vectorize bit-precision reductions. */
7599 if (!type_has_mode_precision_p (op.type))
7600 return false;
7602 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7603    which means the only use of the PHI may be in the lane-reducing operation. */
7604 if (lane_reduc_code_p
7605 && reduc_chain_length != 1
7606 && !only_slp_reduc_chain)
7608 if (dump_enabled_p ())
7609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7610 "lane-reducing reduction with extra stmts.\n");
7611 return false;
7614 /* All uses but the last are expected to be defined in the loop.
7615 The last use is the reduction variable. In case of nested cycle this
7616 assumption is not true: we use reduc_index to record the index of the
7617 reduction variable. */
7618 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7619 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7620 /* We need to skip an extra operand for COND_EXPRs with embedded
7621 comparison. */
7622 unsigned opno_adjust = 0;
7623 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7624 opno_adjust = 1;
7625 for (i = 0; i < (int) op.num_ops; i++)
7627 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7628 if (i == 0 && op.code == COND_EXPR)
7629 continue;
7631 stmt_vec_info def_stmt_info;
7632 enum vect_def_type dt;
7633 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7634 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7635 &vectype_op[i], &def_stmt_info))
7637 if (dump_enabled_p ())
7638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7639 "use not simple.\n");
7640 return false;
7642 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7643 continue;
7645 /* For an IFN_COND_OP we might hit the reduction definition operand
7646 twice (once as definition, once as else). */
7647 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7648 continue;
7650 /* There should be only one cycle def in the stmt, the one
7651 leading to reduc_def. */
7652 if (VECTORIZABLE_CYCLE_DEF (dt))
7653 return false;
7655 if (!vectype_op[i])
7656 vectype_op[i]
7657 = get_vectype_for_scalar_type (loop_vinfo,
7658 TREE_TYPE (op.ops[i]), slp_op[i]);
7660 /* To properly compute ncopies we are interested in the widest
7661 non-reduction input type in case we're looking at a widening
7662 accumulation that we later handle in vect_transform_reduction. */
7663 if (lane_reduc_code_p
7664 && vectype_op[i]
7665 && (!vectype_in
7666 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7667 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7668 vectype_in = vectype_op[i];
7670 if (op.code == COND_EXPR)
7672 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7673 if (dt == vect_constant_def)
7675 cond_reduc_dt = dt;
7676 cond_reduc_val = op.ops[i];
7678 if (dt == vect_induction_def
7679 && def_stmt_info
7680 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7682 cond_reduc_dt = dt;
7683 cond_stmt_vinfo = def_stmt_info;
7687 if (!vectype_in)
7688 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7689 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7691 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7692 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7693 /* If we have a condition reduction, see if we can simplify it further. */
7694 if (v_reduc_type == COND_REDUCTION)
7696 if (slp_node)
7697 return false;
7699 /* If the condition uses the reduction value, fail. */
7700 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7702 if (dump_enabled_p ())
7703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7704 "condition depends on previous iteration\n");
7705 return false;
7708 if (reduc_chain_length == 1
7709 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7710 OPTIMIZE_FOR_SPEED)
7711 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7712 vectype_in,
7713 OPTIMIZE_FOR_SPEED)))
7715 if (dump_enabled_p ())
7716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7717 "optimizing condition reduction with"
7718 " FOLD_EXTRACT_LAST.\n");
7719 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7721 else if (cond_reduc_dt == vect_induction_def)
7723 tree base
7724 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7725 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7727 gcc_assert (TREE_CODE (base) == INTEGER_CST
7728 && TREE_CODE (step) == INTEGER_CST);
7729 cond_reduc_val = NULL_TREE;
7730 enum tree_code cond_reduc_op_code = ERROR_MARK;
7731 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7732 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7734 /* Find a suitable value: below base for MAX_EXPR, above base for
7735    MIN_EXPR; punt for now if base is the minimum value of the type for
7736    MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7737 else if (tree_int_cst_sgn (step) == -1)
7739 cond_reduc_op_code = MIN_EXPR;
7740 if (tree_int_cst_sgn (base) == -1)
7741 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7742 else if (tree_int_cst_lt (base,
7743 TYPE_MAX_VALUE (TREE_TYPE (base))))
7744 cond_reduc_val
7745 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7747 else
7749 cond_reduc_op_code = MAX_EXPR;
7750 if (tree_int_cst_sgn (base) == 1)
7751 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7752 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7753 base))
7754 cond_reduc_val
7755 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7757 if (cond_reduc_val)
7759 if (dump_enabled_p ())
7760 dump_printf_loc (MSG_NOTE, vect_location,
7761 "condition expression based on "
7762 "integer induction.\n");
7763 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7764 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7765 = cond_reduc_val;
7766 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7769 else if (cond_reduc_dt == vect_constant_def)
7771 enum vect_def_type cond_initial_dt;
7772 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7773 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7774 if (cond_initial_dt == vect_constant_def
7775 && types_compatible_p (TREE_TYPE (cond_initial_val),
7776 TREE_TYPE (cond_reduc_val)))
7778 tree e = fold_binary (LE_EXPR, boolean_type_node,
7779 cond_initial_val, cond_reduc_val);
7780 if (e && (integer_onep (e) || integer_zerop (e)))
7782 if (dump_enabled_p ())
7783 dump_printf_loc (MSG_NOTE, vect_location,
7784 "condition expression based on "
7785 "compile time constant.\n");
7786 /* Record reduction code at analysis stage. */
7787 STMT_VINFO_REDUC_CODE (reduc_info)
7788 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7789 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7795 if (STMT_VINFO_LIVE_P (phi_info))
7796 return false;
7798 if (slp_node)
7799 ncopies = 1;
7800 else
7801 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7803 gcc_assert (ncopies >= 1);
7805 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7807 if (nested_cycle)
7809 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7810 == vect_double_reduction_def);
7811 double_reduc = true;
7814 /* 4.2. Check support for the epilog operation.
7816 If STMT represents a reduction pattern, then the type of the
7817 reduction variable may be different than the type of the rest
7818 of the arguments. For example, consider the case of accumulation
7819 of shorts into an int accumulator; the original code:
7820 S1: int_a = (int) short_a;
7821 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7823 was replaced with:
7824 STMT: int_acc = widen_sum <short_a, int_acc>
7826 This means that:
7827 1. The tree-code that is used to create the vector operation in the
7828 epilog code (that reduces the partial results) is not the
7829 tree-code of STMT, but is rather the tree-code of the original
7830 stmt from the pattern that STMT is replacing. I.e, in the example
7831 above we want to use 'widen_sum' in the loop, but 'plus' in the
7832 epilog.
7833 2. The type (mode) we use to check available target support
7834 for the vector operation to be created in the *epilog*, is
7835 determined by the type of the reduction variable (in the example
7836 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7837 However the type (mode) we use to check available target support
7838 for the vector operation to be created *inside the loop*, is
7839 determined by the type of the other arguments to STMT (in the
7840 example we'd check this: optab_handler (widen_sum_optab,
7841 vect_short_mode)).
7843 This is contrary to "regular" reductions, in which the types of all
7844 the arguments are the same as the type of the reduction variable.
7845 For "regular" reductions we can therefore use the same vector type
7846 (and also the same tree-code) when generating the epilog code and
7847 when generating the code inside the loop. */
7849 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7851 /* A conversion might already have created a conditional operation like
7852    IFN_COND_ADD; if so, use the corresponding tree code for the following checks. */
7853 if (orig_code.is_internal_fn ())
7855 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7856 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7859 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7861 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7862 if (reduction_type == TREE_CODE_REDUCTION)
7864 /* Check whether it's ok to change the order of the computation.
7865 Generally, when vectorizing a reduction we change the order of the
7866 computation. This may change the behavior of the program in some
7867 cases, so we need to check that this is ok. One exception is when
7868 vectorizing an outer-loop: the inner-loop is executed sequentially,
7869 and therefore vectorizing reductions in the inner-loop during
7870 outer-loop vectorization is safe. Likewise when we are vectorizing
7871 a series of reductions using SLP and the VF is one the reductions
7872 are performed in scalar order. */
7873 if (slp_node
7874 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7875 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7877 else if (needs_fold_left_reduction_p (op.type, orig_code))
7879 /* When vectorizing a reduction chain without SLP the reduction PHI
7880    is not directly used in stmt. */
7881 if (!only_slp_reduc_chain
7882 && reduc_chain_length != 1)
7884 if (dump_enabled_p ())
7885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7886 "in-order reduction chain without SLP.\n");
7887 return false;
7889 STMT_VINFO_REDUC_TYPE (reduc_info)
7890 = reduction_type = FOLD_LEFT_REDUCTION;
7892 else if (!commutative_binary_op_p (orig_code, op.type)
7893 || !associative_binary_op_p (orig_code, op.type))
7895 if (dump_enabled_p ())
7896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7897 "reduction: not commutative/associative\n");
7898 return false;
7902 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7903 && ncopies > 1)
7905 if (dump_enabled_p ())
7906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7907 "multiple types in double reduction or condition "
7908 "reduction or fold-left reduction.\n");
7909 return false;
7912 internal_fn reduc_fn = IFN_LAST;
7913 if (reduction_type == TREE_CODE_REDUCTION
7914 || reduction_type == FOLD_LEFT_REDUCTION
7915 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7916 || reduction_type == CONST_COND_REDUCTION)
7918 if (reduction_type == FOLD_LEFT_REDUCTION
7919 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7920 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7922 if (reduc_fn != IFN_LAST
7923 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7924 OPTIMIZE_FOR_SPEED))
7926 if (dump_enabled_p ())
7927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7928 "reduc op not supported by target.\n");
7930 reduc_fn = IFN_LAST;
7933 else
7935 if (!nested_cycle || double_reduc)
7937 if (dump_enabled_p ())
7938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7939 "no reduc code for scalar code.\n");
7941 return false;
7945 else if (reduction_type == COND_REDUCTION)
7947 int scalar_precision
7948 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7949 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7950 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7951 vectype_out);
7953 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7954 OPTIMIZE_FOR_SPEED))
7955 reduc_fn = IFN_REDUC_MAX;
7957 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7959 if (reduction_type != EXTRACT_LAST_REDUCTION
7960 && (!nested_cycle || double_reduc)
7961 && reduc_fn == IFN_LAST
7962 && !nunits_out.is_constant ())
7964 if (dump_enabled_p ())
7965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7966 "missing target support for reduction on"
7967 " variable-length vectors.\n");
7968 return false;
7971 /* For SLP reductions, see if there is a neutral value we can use. */
7972 tree neutral_op = NULL_TREE;
7973 if (slp_node)
7975 tree initial_value = NULL_TREE;
7976 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7977 initial_value = vect_phi_initial_value (reduc_def_phi);
7978 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7979 orig_code, initial_value);
7982 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7984 /* We can't support in-order reductions of code such as this:
7986 for (int i = 0; i < n1; ++i)
7987 for (int j = 0; j < n2; ++j)
7988 l += a[j];
7990 since GCC effectively transforms the loop when vectorizing:
7992 for (int i = 0; i < n1 / VF; ++i)
7993 for (int j = 0; j < n2; ++j)
7994 for (int k = 0; k < VF; ++k)
7995 l += a[j];
7997 which is a reassociation of the original operation. */
7998 if (dump_enabled_p ())
7999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8000 "in-order double reduction not supported.\n");
8002 return false;
8005 if (reduction_type == FOLD_LEFT_REDUCTION
8006 && slp_node
8007 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8009 /* We cannot use in-order reductions in this case because there is
8010 an implicit reassociation of the operations involved. */
8011 if (dump_enabled_p ())
8012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8013 "in-order unchained SLP reductions not supported.\n");
8014 return false;
8017 /* For double reductions, and for SLP reductions with a neutral value,
8018 we construct a variable-length initial vector by loading a vector
8019 full of the neutral value and then shift-and-inserting the start
8020 values into the low-numbered elements. */
8021 if ((double_reduc || neutral_op)
8022 && !nunits_out.is_constant ()
8023 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8024 vectype_out, OPTIMIZE_FOR_SPEED))
8026 if (dump_enabled_p ())
8027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8028 "reduction on variable-length vectors requires"
8029 " target support for a vector-shift-and-insert"
8030 " operation.\n");
8031 return false;
8034 /* Check extra constraints for variable-length unchained SLP reductions. */
8035 if (slp_node
8036 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8037 && !nunits_out.is_constant ())
8039 /* We checked above that we could build the initial vector when
8040 there's a neutral element value. Check here for the case in
8041 which each SLP statement has its own initial value and in which
8042 that value needs to be repeated for every instance of the
8043 statement within the initial vector. */
8044 unsigned int group_size = SLP_TREE_LANES (slp_node);
8045 if (!neutral_op
8046 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8047 TREE_TYPE (vectype_out)))
8049 if (dump_enabled_p ())
8050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8051 "unsupported form of SLP reduction for"
8052 " variable-length vectors: cannot build"
8053 " initial vector.\n");
8054 return false;
8056 /* The epilogue code relies on the number of elements being a multiple
8057 of the group size. The duplicate-and-interleave approach to setting
8058 up the initial vector does too. */
8059 if (!multiple_p (nunits_out, group_size))
8061 if (dump_enabled_p ())
8062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8063 "unsupported form of SLP reduction for"
8064 " variable-length vectors: the vector size"
8065 " is not a multiple of the number of results.\n");
8066 return false;
8070 if (reduction_type == COND_REDUCTION)
8072 widest_int ni;
8074 if (! max_loop_iterations (loop, &ni))
8076 if (dump_enabled_p ())
8077 dump_printf_loc (MSG_NOTE, vect_location,
8078 "loop count not known, cannot create cond "
8079 "reduction.\n");
8080 return false;
8082 /* Convert backedges to iterations. */
8083 ni += 1;
8085 /* The additional index will have the same type as the condition. Check
8086    that the iteration count fits into this type less one (because we use
8087    up the zero slot for when there are no matches). */
8088 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8089 if (wi::geu_p (ni, wi::to_widest (max_index)))
8091 if (dump_enabled_p ())
8092 dump_printf_loc (MSG_NOTE, vect_location,
8093 "loop size is greater than data size.\n");
8094 return false;
8098 /* In case the vectorization factor (VF) is bigger than the number
8099    of elements that we can fit in a vectype (nunits), we have to generate
8100    more than one vector stmt, i.e. we need to "unroll" the
8101    vector stmt by a factor of VF/nunits. For more details see the
8102    documentation in vectorizable_operation. */
8104 /* If the reduction is used in an outer loop we need to generate
8105 VF intermediate results, like so (e.g. for ncopies=2):
8106 r0 = phi (init, r0)
8107 r1 = phi (init, r1)
8108 r0 = x0 + r0;
8109 r1 = x1 + r1;
8110 (i.e. we generate VF results in 2 registers).
8111 In this case we have a separate def-use cycle for each copy, and therefore
8112 for each copy we get the vector def for the reduction variable from the
8113 respective phi node created for this copy.
8115 Otherwise (the reduction is unused in the loop nest), we can combine
8116 together intermediate results, like so (e.g. for ncopies=2):
8117 r = phi (init, r)
8118 r = x0 + r;
8119 r = x1 + r;
8120 (i.e. we generate VF/2 results in a single register).
8121 In this case for each copy we get the vector def for the reduction variable
8122 from the vectorized reduction operation generated in the previous iteration.
8124 This only works when we see both the reduction PHI and its only consumer
8125 in vectorizable_reduction and there are no intermediate stmts
8126 participating. When unrolling we want each unrolled iteration to have its
8127 own reduction accumulator since one of the main goals of unrolling a
8128 reduction is to reduce the aggregate loop-carried latency. */
8129 if (ncopies > 1
8130 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8131 && reduc_chain_length == 1
8132 && loop_vinfo->suggested_unroll_factor == 1)
8133 single_defuse_cycle = true;
8135 if (single_defuse_cycle || lane_reduc_code_p)
8137 gcc_assert (op.code != COND_EXPR);
8139 /* 4. Supportable by target? */
8140 bool ok = true;
8142 /* 4.1. check support for the operation in the loop
8144 This isn't necessary for the lane reduction codes, since they
8145 can only be produced by pattern matching, and it's up to the
8146 pattern matcher to test for support. The main reason for
8147 specifically skipping this step is to avoid rechecking whether
8148 mixed-sign dot-products can be implemented using signed
8149 dot-products. */
8150 machine_mode vec_mode = TYPE_MODE (vectype_in);
8151 if (!lane_reduc_code_p
8152 && !directly_supported_p (op.code, vectype_in, optab_vector))
8154 if (dump_enabled_p ())
8155 dump_printf (MSG_NOTE, "op not supported by target.\n");
8156 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8157 || !vect_can_vectorize_without_simd_p (op.code))
8158 ok = false;
8159 else
8160 if (dump_enabled_p ())
8161 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8164 if (vect_emulated_vector_p (vectype_in)
8165 && !vect_can_vectorize_without_simd_p (op.code))
8167 if (dump_enabled_p ())
8168 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8169 return false;
8172 /* lane-reducing operations have to go through vect_transform_reduction.
8173 For the other cases try without the single cycle optimization. */
8174 if (!ok)
8176 if (lane_reduc_code_p)
8177 return false;
8178 else
8179 single_defuse_cycle = false;
8182 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8184 /* If the reduction stmt is one of the patterns that have lane
8185 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8186 if ((ncopies > 1 && ! single_defuse_cycle)
8187 && lane_reduc_code_p)
8189 if (dump_enabled_p ())
8190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8191 "multi def-use cycle not possible for lane-reducing "
8192 "reduction operation\n");
8193 return false;
8196 if (slp_node
8197 && !(!single_defuse_cycle
8198 && !lane_reduc_code_p
8199 && reduction_type != FOLD_LEFT_REDUCTION))
8200 for (i = 0; i < (int) op.num_ops; i++)
8201 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8203 if (dump_enabled_p ())
8204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8205 "incompatible vector types for invariants\n");
8206 return false;
8209 if (slp_node)
8210 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8211 else
8212 vec_num = 1;
8214 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8215 reduction_type, ncopies, cost_vec);
8216 /* Cost the reduction op inside the loop if transformed via
8217 vect_transform_reduction. Otherwise this is costed by the
8218 separate vectorizable_* routines. */
8219 if (single_defuse_cycle || lane_reduc_code_p)
8221 int factor = 1;
8222 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8223 /* Three dot-products and a subtraction. */
8224 factor = 4;
8225 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8226 stmt_info, 0, vect_body);
8229 if (dump_enabled_p ()
8230 && reduction_type == FOLD_LEFT_REDUCTION)
8231 dump_printf_loc (MSG_NOTE, vect_location,
8232 "using an in-order (fold-left) reduction.\n");
8233 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8234 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8235 reductions go through their own vectorizable_* routines. */
8236 if (!single_defuse_cycle
8237 && !lane_reduc_code_p
8238 && reduction_type != FOLD_LEFT_REDUCTION)
8240 stmt_vec_info tem
8241 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8242 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8244 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8245 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8247 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8248 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8250 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8252 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8253 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8254 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8256 if (reduction_type != FOLD_LEFT_REDUCTION
8257 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8258 && (cond_fn == IFN_LAST
8259 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8260 OPTIMIZE_FOR_SPEED)))
8262 if (dump_enabled_p ())
8263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8264 "can't operate on partial vectors because"
8265 " no conditional operation is available.\n");
8266 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8268 else if (reduction_type == FOLD_LEFT_REDUCTION
8269 && reduc_fn == IFN_LAST
8270 && !expand_vec_cond_expr_p (vectype_in,
8271 truth_type_for (vectype_in),
8272 SSA_NAME))
8274 if (dump_enabled_p ())
8275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8276 "can't operate on partial vectors because"
8277 " no conditional operation is available.\n");
8278 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8280 else if (reduction_type == FOLD_LEFT_REDUCTION
8281 && internal_fn_mask_index (reduc_fn) == -1
8282 && FLOAT_TYPE_P (vectype_in)
8283 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8285 if (dump_enabled_p ())
8286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8287 "can't operate on partial vectors because"
8288 " signed zeros cannot be preserved.\n");
8289 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8291 else
8293 internal_fn mask_reduc_fn
8294 = get_masked_reduction_fn (reduc_fn, vectype_in);
8296 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8297 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8298 vectype_in, 1);
8299 else
8300 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8301 vectype_in, NULL);
8304 return true;
8307 /* STMT_INFO is a dot-product reduction whose multiplication operands
8308 have different signs. Emit a sequence to emulate the operation
8309 using a series of signed DOT_PROD_EXPRs and return the last
8310 statement generated. VEC_DEST is the result of the vector operation
8311 and VOP lists its inputs. */
8313 static gassign *
8314 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8315 gimple_stmt_iterator *gsi, tree vec_dest,
8316 tree vop[3])
8318 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8319 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8320 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8321 gimple *new_stmt;
8323 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8324 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8325 std::swap (vop[0], vop[1]);
8327 /* Convert all inputs to signed types. */
8328 for (int i = 0; i < 3; ++i)
8329 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8331 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8332 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8333 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8334 vop[i] = tmp;
8337 /* In the comments below we assume 8-bit inputs for simplicity,
8338 but the approach works for any full integer type. */
8340 /* Create a vector of -128. */
8341 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8342 tree min_narrow = build_vector_from_val (narrow_vectype,
8343 min_narrow_elttype);
8345 /* Create a vector of 64. */
8346 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8347 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8348 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8350 /* Emit: SUB_RES = VOP[0] - 128. */
8351 tree sub_res = make_ssa_name (narrow_vectype);
8352 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8353 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8355 /* Emit:
8357 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8358 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8359 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8361 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8362 Doing the two 64 * y steps first allows more time to compute x. */
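/* As a concrete check, take the unsigned input x = 200 and signed y = -3:
200 * -3 = (200 - 128) * -3 + 64 * -3 + 64 * -3
= -216 + -192 + -192 = -600,
and SUB_RES wraps to 200 - 128 = 72 in 8 bits, so every factor fits the
signed 8-bit range required by the signed DOT_PROD_EXPRs below. */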
8363 tree stage1 = make_ssa_name (wide_vectype);
8364 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8365 vop[1], half_narrow, vop[2]);
8366 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8368 tree stage2 = make_ssa_name (wide_vectype);
8369 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8370 vop[1], half_narrow, stage1);
8371 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8373 tree stage3 = make_ssa_name (wide_vectype);
8374 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8375 sub_res, vop[1], stage2);
8376 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8378 /* Convert STAGE3 to the reduction type. */
8379 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8382 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8383 value. */
8385 bool
8386 vect_transform_reduction (loop_vec_info loop_vinfo,
8387 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8388 gimple **vec_stmt, slp_tree slp_node)
8390 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8391 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8392 int i;
8393 int ncopies;
8394 int vec_num;
8396 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8397 gcc_assert (reduc_info->is_reduc_info);
8399 if (nested_in_vect_loop_p (loop, stmt_info))
8401 loop = loop->inner;
8402 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8405 gimple_match_op op;
8406 if (!gimple_extract_op (stmt_info->stmt, &op))
8407 gcc_unreachable ();
8409 /* All uses but the last are expected to be defined in the loop.
8410 The last use is the reduction variable. In case of nested cycle this
8411 assumption is not true: we use reduc_index to record the index of the
8412 reduction variable. */
8413 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8414 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8415 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8416 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8418 if (slp_node)
8420 ncopies = 1;
8421 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8423 else
8425 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8426 vec_num = 1;
8429 code_helper code = canonicalize_code (op.code, op.type);
8430 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8432 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8433 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8434 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8436 /* Transform. */
8437 tree new_temp = NULL_TREE;
8438 auto_vec<tree> vec_oprnds0;
8439 auto_vec<tree> vec_oprnds1;
8440 auto_vec<tree> vec_oprnds2;
8441 tree def0;
8443 if (dump_enabled_p ())
8444 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8446 /* FORNOW: Multiple types are not supported for condition. */
8447 if (code == COND_EXPR)
8448 gcc_assert (ncopies == 1);
8450 /* A binary COND_OP reduction must have the same definition and else
8451 value. */
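/* For example IFN_COND_ADD <mask, res, x, res> adds X to RES where MASK
is set and keeps RES unchanged elsewhere, so the reduction operand
doubles as the else value; the assertion below checks exactly that. */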
8452 bool cond_fn_p = code.is_internal_fn ()
8453 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8454 if (cond_fn_p)
8456 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8457 || code == IFN_COND_MUL || code == IFN_COND_AND
8458 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8459 gcc_assert (op.num_ops == 4
8460 && (op.ops[reduc_index]
8461 == op.ops[internal_fn_else_index ((internal_fn) code)]));
8464 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8466 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8467 if (reduction_type == FOLD_LEFT_REDUCTION)
8469 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8470 gcc_assert (code.is_tree_code () || cond_fn_p);
8471 return vectorize_fold_left_reduction
8472 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8473 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8474 reduc_index, masks, lens);
8477 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8478 gcc_assert (single_defuse_cycle
8479 || code == DOT_PROD_EXPR
8480 || code == WIDEN_SUM_EXPR
8481 || code == SAD_EXPR);
8483 /* Create the destination vector */
8484 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8485 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8487 /* Get NCOPIES vector definitions for all operands except the reduction
8488 definition. */
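/* Passing NULL_TREE for an operand makes vect_get_vec_defs skip it; for a
single def-use cycle the reduction operand's vector definition is instead
the single PHI result fetched below via vect_get_vec_defs_for_operand. */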
8489 if (!cond_fn_p)
8491 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8492 single_defuse_cycle && reduc_index == 0
8493 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8494 single_defuse_cycle && reduc_index == 1
8495 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8496 op.num_ops == 3
8497 && !(single_defuse_cycle && reduc_index == 2)
8498 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8500 else
8502 /* For a conditional operation pass the truth type as mask
8503 vectype. */
8504 gcc_assert (single_defuse_cycle
8505 && (reduc_index == 1 || reduc_index == 2));
8506 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8507 op.ops[0], &vec_oprnds0, truth_type_for (vectype_in),
8508 reduc_index == 1 ? NULL_TREE : op.ops[1],
8509 &vec_oprnds1, NULL_TREE,
8510 reduc_index == 2 ? NULL_TREE : op.ops[2],
8511 &vec_oprnds2, NULL_TREE);
8514 /* For single def-use cycles get one copy of the vectorized reduction
8515 definition. */
8516 if (single_defuse_cycle)
8518 gcc_assert (!slp_node);
8519 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8520 op.ops[reduc_index],
8521 reduc_index == 0 ? &vec_oprnds0
8522 : (reduc_index == 1 ? &vec_oprnds1
8523 : &vec_oprnds2));
8526 bool emulated_mixed_dot_prod
8527 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8528 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8530 gimple *new_stmt;
8531 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8532 if (masked_loop_p && !mask_by_cond_expr)
8534 /* No conditional ifns have been defined for dot-product yet. */
8535 gcc_assert (code != DOT_PROD_EXPR);
8537 /* Make sure that the reduction accumulator is vop[0]. */
8538 if (reduc_index == 1)
8540 gcc_assert (commutative_binary_op_p (code, op.type));
8541 std::swap (vop[0], vop[1]);
8543 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8544 vec_num * ncopies, vectype_in, i);
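/* Emit MASK ? VOP[0] OP VOP[1] : VOP[0]; using the accumulator as the
else value leaves masked-off lanes unchanged, which is what a partial
vector iteration of the reduction needs. */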
8545 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8546 vop[0], vop[1], vop[0]);
8547 new_temp = make_ssa_name (vec_dest, call);
8548 gimple_call_set_lhs (call, new_temp);
8549 gimple_call_set_nothrow (call, true);
8550 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8551 new_stmt = call;
8553 else
8555 if (op.num_ops >= 3)
8556 vop[2] = vec_oprnds2[i];
8558 if (masked_loop_p && mask_by_cond_expr)
8560 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8561 vec_num * ncopies, vectype_in, i);
8562 build_vect_cond_expr (code, vop, mask, gsi);
8565 if (emulated_mixed_dot_prod)
8566 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8567 vec_dest, vop);
8569 else if (code.is_internal_fn () && !cond_fn_p)
8570 new_stmt = gimple_build_call_internal (internal_fn (code),
8571 op.num_ops,
8572 vop[0], vop[1], vop[2]);
8573 else if (code.is_internal_fn () && cond_fn_p)
8574 new_stmt = gimple_build_call_internal (internal_fn (code),
8575 op.num_ops,
8576 vop[0], vop[1], vop[2],
8577 vop[1]);
8578 else
8579 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8580 vop[0], vop[1], vop[2]);
8581 new_temp = make_ssa_name (vec_dest, new_stmt);
8582 gimple_set_lhs (new_stmt, new_temp);
8583 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8586 if (slp_node)
8587 slp_node->push_vec_def (new_stmt);
8588 else if (single_defuse_cycle
8589 && i < ncopies - 1)
8591 if (reduc_index == 0)
8592 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8593 else if (reduc_index == 1)
8594 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8595 else if (reduc_index == 2)
8596 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8598 else
8599 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8602 if (!slp_node)
8603 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8605 return true;
8608 /* Transform phase of a cycle PHI. */
8610 bool
8611 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8612 stmt_vec_info stmt_info, gimple **vec_stmt,
8613 slp_tree slp_node, slp_instance slp_node_instance)
8615 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8616 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8617 int i;
8618 int ncopies;
8619 int j;
8620 bool nested_cycle = false;
8621 int vec_num;
8623 if (nested_in_vect_loop_p (loop, stmt_info))
8625 loop = loop->inner;
8626 nested_cycle = true;
8629 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8630 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8631 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8632 gcc_assert (reduc_info->is_reduc_info);
8634 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8635 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8636 /* Leave the scalar phi in place. */
8637 return true;
8639 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8640 /* For a nested cycle the above is not filled in; fall back to the PHI's vectype. */
8641 if (!vectype_in)
8642 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8643 gcc_assert (vectype_in);
8645 if (slp_node)
8647 /* The size vect_schedule_slp_instance computes is off for us. */
8648 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8649 * SLP_TREE_LANES (slp_node), vectype_in);
8650 ncopies = 1;
8652 else
8654 vec_num = 1;
8655 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8658 /* Check whether we should use a single PHI node and accumulate
8659 vectors to one before the backedge. */
8660 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8661 ncopies = 1;
8663 /* Create the destination vector */
8664 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8665 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8666 vectype_out);
8668 /* Get the loop-entry arguments. */
8669 tree vec_initial_def = NULL_TREE;
8670 auto_vec<tree> vec_initial_defs;
8671 if (slp_node)
8673 vec_initial_defs.reserve (vec_num);
8674 if (nested_cycle)
8676 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8677 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8678 &vec_initial_defs);
8680 else
8682 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8683 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8684 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8686 unsigned int num_phis = stmts.length ();
8687 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8688 num_phis = 1;
8689 initial_values.reserve (num_phis);
8690 for (unsigned int i = 0; i < num_phis; ++i)
8692 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8693 initial_values.quick_push (vect_phi_initial_value (this_phi));
8695 if (vec_num == 1)
8696 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8697 if (!initial_values.is_empty ())
8699 tree initial_value
8700 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8701 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8702 tree neutral_op
8703 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8704 code, initial_value);
8705 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8706 &vec_initial_defs, vec_num,
8707 stmts.length (), neutral_op);
8711 else
8713 /* Get at the scalar def before the loop, that defines the initial
8714 value of the reduction variable. */
8715 tree initial_def = vect_phi_initial_value (phi);
8716 reduc_info->reduc_initial_values.safe_push (initial_def);
8717 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8718 and we can't use zero for induc_val, use initial_def. Similarly
8719 for REDUC_MIN and initial_def larger than the base. */
8720 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8722 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8723 if (TREE_CODE (initial_def) == INTEGER_CST
8724 && !integer_zerop (induc_val)
8725 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8726 && tree_int_cst_lt (initial_def, induc_val))
8727 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8728 && tree_int_cst_lt (induc_val, initial_def))))
8730 induc_val = initial_def;
8731 /* Communicate that we used the initial_def to the epilogue
8732 generation. */
8733 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8735 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8737 else if (nested_cycle)
8739 /* Do not use an adjustment def as that case is not supported
8740 correctly if ncopies is not one. */
8741 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8742 ncopies, initial_def,
8743 &vec_initial_defs);
8745 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8746 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8747 /* Fill the initial vector with the initial scalar value. */
8748 vec_initial_def
8749 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8750 initial_def, initial_def);
8751 else
8753 if (ncopies == 1)
8754 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8755 if (!reduc_info->reduc_initial_values.is_empty ())
8757 initial_def = reduc_info->reduc_initial_values[0];
8758 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8759 tree neutral_op
8760 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8761 code, initial_def);
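/* E.g. zero for PLUS_EXPR, one for MULT_EXPR, all-ones for BIT_AND_EXPR
and the initial value itself for MIN/MAX reductions. */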
8762 gcc_assert (neutral_op);
8763 /* Try to simplify the vector initialization by applying an
8764 adjustment after the reduction has been performed. */
8765 if (!reduc_info->reused_accumulator
8766 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8767 && !operand_equal_p (neutral_op, initial_def))
8769 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8770 = initial_def;
8771 initial_def = neutral_op;
8773 vec_initial_def
8774 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8775 initial_def, neutral_op);
8780 if (vec_initial_def)
8782 vec_initial_defs.create (ncopies);
8783 for (i = 0; i < ncopies; ++i)
8784 vec_initial_defs.quick_push (vec_initial_def);
8787 if (auto *accumulator = reduc_info->reused_accumulator)
8789 tree def = accumulator->reduc_input;
8790 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8792 unsigned int nreduc;
8793 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8794 (TREE_TYPE (def)),
8795 TYPE_VECTOR_SUBPARTS (vectype_out),
8796 &nreduc);
8797 gcc_assert (res);
8798 gimple_seq stmts = NULL;
8799 /* Reduce the single vector to a smaller one. */
8800 if (nreduc != 1)
8802 /* Perform the reduction in the appropriate type. */
8803 tree rvectype = vectype_out;
8804 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8805 TREE_TYPE (TREE_TYPE (def))))
8806 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8807 TYPE_VECTOR_SUBPARTS
8808 (vectype_out));
8809 def = vect_create_partial_epilog (def, rvectype,
8810 STMT_VINFO_REDUC_CODE
8811 (reduc_info),
8812 &stmts);
8814 /* The epilogue loop might use a different vector mode, like
8815 VNx2DI vs. V2DI. */
8816 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8818 tree reduc_type = build_vector_type_for_mode
8819 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8820 def = gimple_convert (&stmts, reduc_type, def);
8822 /* Adjust the input so we pick up the partially reduced value
8823 for the skip edge in vect_create_epilog_for_reduction. */
8824 accumulator->reduc_input = def;
8825 /* And the reduction could be carried out using a different sign. */
8826 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8827 def = gimple_convert (&stmts, vectype_out, def);
8828 if (loop_vinfo->main_loop_edge)
8830 /* While we'd like to insert on the edge, this would split
8831 blocks and disturb bookkeeping; we will also eventually
8832 need this on the skip edge. Rely on sinking to
8833 fix up the optimal placement and insert in the predecessor. */
8834 gimple_stmt_iterator gsi
8835 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8836 /* Insert before a cond that eventually skips the
8837 epilogue. */
8838 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8839 gsi_prev (&gsi);
8840 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8842 else
8843 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8844 stmts);
8846 if (loop_vinfo->main_loop_edge)
8847 vec_initial_defs[0]
8848 = vect_get_main_loop_result (loop_vinfo, def,
8849 vec_initial_defs[0]);
8850 else
8851 vec_initial_defs.safe_push (def);
8854 /* Generate the reduction PHIs upfront. */
8855 for (i = 0; i < vec_num; i++)
8857 tree vec_init_def = vec_initial_defs[i];
8858 for (j = 0; j < ncopies; j++)
8860 /* Create the reduction-phi that defines the reduction
8861 operand. */
8862 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8864 /* Set the loop-entry arg of the reduction-phi. */
8865 if (j != 0 && nested_cycle)
8866 vec_init_def = vec_initial_defs[j];
8867 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8868 UNKNOWN_LOCATION);
8870 /* The loop-latch arg is set in epilogue processing. */
8872 if (slp_node)
8873 slp_node->push_vec_def (new_phi);
8874 else
8876 if (j == 0)
8877 *vec_stmt = new_phi;
8878 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8883 return true;
8886 /* Vectorizes LC PHIs. */
8888 bool
8889 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8890 stmt_vec_info stmt_info, gimple **vec_stmt,
8891 slp_tree slp_node)
8893 if (!loop_vinfo
8894 || !is_a <gphi *> (stmt_info->stmt)
8895 || gimple_phi_num_args (stmt_info->stmt) != 1)
8896 return false;
8898 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8899 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8900 return false;
8902 if (!vec_stmt) /* transformation not required. */
8904 /* Deal with copies from externs or constants that are disguised as
8905 loop-closed PHI nodes (PR97886). */
8906 if (slp_node
8907 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8908 SLP_TREE_VECTYPE (slp_node)))
8910 if (dump_enabled_p ())
8911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8912 "incompatible vector types for invariants\n");
8913 return false;
8915 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8916 return true;
8919 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8920 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8921 basic_block bb = gimple_bb (stmt_info->stmt);
8922 edge e = single_pred_edge (bb);
8923 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8924 auto_vec<tree> vec_oprnds;
8925 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8926 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8927 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8928 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8930 /* Create the vectorized LC PHI node. */
8931 gphi *new_phi = create_phi_node (vec_dest, bb);
8932 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8933 if (slp_node)
8934 slp_node->push_vec_def (new_phi);
8935 else
8936 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8938 if (!slp_node)
8939 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8941 return true;
8944 /* Vectorizes PHIs. */
8946 bool
8947 vectorizable_phi (vec_info *,
8948 stmt_vec_info stmt_info, gimple **vec_stmt,
8949 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8951 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8952 return false;
8954 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8955 return false;
8957 tree vectype = SLP_TREE_VECTYPE (slp_node);
8959 if (!vec_stmt) /* transformation not required. */
8961 slp_tree child;
8962 unsigned i;
8963 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8964 if (!child)
8966 if (dump_enabled_p ())
8967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8968 "PHI node with unvectorized backedge def\n");
8969 return false;
8971 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8973 if (dump_enabled_p ())
8974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8975 "incompatible vector types for invariants\n");
8976 return false;
8978 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8979 && !useless_type_conversion_p (vectype,
8980 SLP_TREE_VECTYPE (child)))
8982 /* With bools we can have mask and non-mask precision vectors
8983 or different non-mask precisions. While pattern recognition is
8984 supposed to guarantee consistency here, bugs in it can cause
8985 mismatches (PR103489 and PR103800 for example).
8986 Deal with them here instead of ICEing later. */
8987 if (dump_enabled_p ())
8988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8989 "incompatible vector type setup from "
8990 "bool pattern detection\n");
8991 return false;
8994 /* For single-argument PHIs assume coalescing, which means zero cost
8995 for both the scalar and the vector PHIs. This avoids artificially
8996 favoring the vector path (but may pessimize it in some cases). */
8997 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8998 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8999 vector_stmt, stmt_info, vectype, 0, vect_body);
9000 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9001 return true;
9004 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9005 basic_block bb = gimple_bb (stmt_info->stmt);
9006 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9007 auto_vec<gphi *> new_phis;
9008 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9010 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9012 /* Skip not yet vectorized defs. */
9013 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9014 && SLP_TREE_VEC_DEFS (child).is_empty ())
9015 continue;
9017 auto_vec<tree> vec_oprnds;
9018 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9019 if (!new_phis.exists ())
9021 new_phis.create (vec_oprnds.length ());
9022 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9024 /* Create the vectorized PHI node. */
9025 new_phis.quick_push (create_phi_node (vec_dest, bb));
9026 slp_node->push_vec_def (new_phis[j]);
9029 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9030 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9031 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9033 /* We should have at least one already vectorized child. */
9034 gcc_assert (new_phis.exists ());
9036 return true;
9039 /* Vectorizes first order recurrences. An overview of the transformation
9040 is described below. Suppose we have the following loop.
9042 int t = 0;
9043 for (int i = 0; i < n; ++i)
9045 b[i] = a[i] - t;
9046 t = a[i];
9049 There is a first-order recurrence on 't': each iteration uses the value of
9050 a[i] from the previous iteration. For this loop, the scalar IR looks (simplified) like:
9052 scalar.preheader:
9053 init = 0;
9055 scalar.body:
9056 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9057 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9058 _1 = a[i]
9059 b[i] = _1 - _2
9060 if (i < n) goto scalar.body
9062 In this example, _2 is a recurrence because its value depends on the
9063 previous iteration. We vectorize this as (VF = 4)
9065 vector.preheader:
9066 vect_init = vect_cst(..., ..., ..., 0)
9068 vector.body
9069 i = PHI <0(vector.preheader), i+4(vector.body)>
9070 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9071 vect_2 = a[i, i+1, i+2, i+3];
9072 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9073 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9074 if (..) goto vector.body
9076 In this function, vectorizable_recurr, we code generate both the
9077 vector PHI node and the permute since those together compute the
9078 vectorized value of the scalar PHI. We do not yet have the
9079 backedge value to fill in there nor into the vec_perm. Those
9080 are filled in maybe_set_vectorized_backedge_value and
9081 vect_schedule_scc.
9083 TODO: Since the scalar loop does not have a use of the recurrence
9084 outside of the loop, the natural way to implement peeling via
9085 vectorizing the live value doesn't work. For now peeling of loops
9086 with a recurrence is not implemented. For SLP the supported cases
9087 are restricted to those requiring a single vector recurrence PHI. */
9089 bool
9090 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9091 gimple **vec_stmt, slp_tree slp_node,
9092 stmt_vector_for_cost *cost_vec)
9094 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9095 return false;
9097 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9099 /* So far we only support first-order recurrence auto-vectorization. */
9100 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9101 return false;
9103 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9104 unsigned ncopies;
9105 if (slp_node)
9106 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9107 else
9108 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9109 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9110 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9111 /* We need to be able to make progress with a single vector. */
9112 if (maybe_gt (dist * 2, nunits))
9114 if (dump_enabled_p ())
9115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9116 "first order recurrence exceeds half of "
9117 "a vector\n");
9118 return false;
9121 /* First-order recurrence autovectorization needs to handle permutation
9122 with indices = [nunits-1, nunits, nunits+1, ...]. */
9123 vec_perm_builder sel (nunits, 1, 3);
9124 for (int i = 0; i < 3; ++i)
9125 sel.quick_push (nunits - dist + i);
9126 vec_perm_indices indices (sel, 2, nunits);
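/* For example, with nunits = 4 and dist = 1 this encodes the indices
{ 3, 4, 5, 6 }: lane 3 of the previous vector followed by lanes 0-2
of the current one, matching the VEC_PERM in the overview above. */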
9128 if (!vec_stmt) /* transformation not required. */
9130 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9131 indices))
9132 return false;
9134 if (slp_node)
9136 /* We eventually need to set a vector type on invariant
9137 arguments. */
9138 unsigned j;
9139 slp_tree child;
9140 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9141 if (!vect_maybe_update_slp_op_vectype
9142 (child, SLP_TREE_VECTYPE (slp_node)))
9144 if (dump_enabled_p ())
9145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9146 "incompatible vector types for "
9147 "invariants\n");
9148 return false;
9151 /* The recurrence costs the initialization vector and one permute
9152 for each copy. */
9153 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9154 stmt_info, 0, vect_prologue);
9155 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9156 stmt_info, 0, vect_body);
9157 if (dump_enabled_p ())
9158 dump_printf_loc (MSG_NOTE, vect_location,
9159 "vectorizable_recurr: inside_cost = %d, "
9160 "prologue_cost = %d .\n", inside_cost,
9161 prologue_cost);
9163 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9164 return true;
9167 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9168 basic_block bb = gimple_bb (phi);
9169 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9170 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9172 gimple_seq stmts = NULL;
9173 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9174 gsi_insert_seq_on_edge_immediate (pe, stmts);
9176 tree vec_init = build_vector_from_val (vectype, preheader);
9177 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9179 /* Create the vectorized first-order PHI node. */
9180 tree vec_dest = vect_get_new_vect_var (vectype,
9181 vect_simple_var, "vec_recur_");
9182 gphi *new_phi = create_phi_node (vec_dest, bb);
9183 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9185 /* Insert the shuffles for the first-order recurrence autovectorization:
9186 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9187 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9189 /* Insert the required permute after the latch definition. The
9190 second and later operands are tentative and will be updated when we have
9191 vectorized the latch definition. */
9192 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9193 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9194 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9195 gsi_next (&gsi2);
9197 for (unsigned i = 0; i < ncopies; ++i)
9199 vec_dest = make_ssa_name (vectype);
9200 gassign *vperm
9201 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9202 i == 0 ? gimple_phi_result (new_phi) : NULL,
9203 NULL, perm);
9204 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9206 if (slp_node)
9207 slp_node->push_vec_def (vperm);
9208 else
9209 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9212 if (!slp_node)
9213 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9214 return true;
9217 /* Return true if VECTYPE represents a vector that requires lowering
9218 by the vector lowering pass. */
9220 bool
9221 vect_emulated_vector_p (tree vectype)
9223 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9224 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9225 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9228 /* Return true if we can emulate CODE on an integer mode representation
9229 of a vector. */
9231 bool
9232 vect_can_vectorize_without_simd_p (tree_code code)
9234 switch (code)
9236 case PLUS_EXPR:
9237 case MINUS_EXPR:
9238 case NEGATE_EXPR:
9239 case BIT_AND_EXPR:
9240 case BIT_IOR_EXPR:
9241 case BIT_XOR_EXPR:
9242 case BIT_NOT_EXPR:
9243 return true;
9245 default:
9246 return false;
9250 /* Likewise, but taking a code_helper. */
9252 bool
9253 vect_can_vectorize_without_simd_p (code_helper code)
9255 return (code.is_tree_code ()
9256 && vect_can_vectorize_without_simd_p (tree_code (code)));
9259 /* Create vector init for vectorized iv. */
9260 static tree
9261 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9262 tree step_expr, poly_uint64 nunits,
9263 tree vectype,
9264 enum vect_induction_op_type induction_type)
9266 unsigned HOST_WIDE_INT const_nunits;
9267 tree vec_shift, vec_init, new_name;
9268 unsigned i;
9269 tree itype = TREE_TYPE (vectype);
9271 /* iv_loop is the loop to be vectorized. Create the first VF values of the IV:
9272 e.g. [X, X>>S, X>>2*S, ...] for shr, [X, -X, X, -X, ...] for neg and [X, X*S, X*S^2, ...] for mul (S = step_expr, X = init_expr). */
9273 new_name = gimple_convert (stmts, itype, init_expr);
9274 switch (induction_type)
9276 case vect_step_op_shr:
9277 case vect_step_op_shl:
9278 /* Build the initial value: splat INIT and shift it by the series [0, S, 2*S, ...]. */
9279 vec_init = gimple_build_vector_from_val (stmts,
9280 vectype,
9281 new_name);
9282 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9283 build_zero_cst (itype), step_expr);
9284 vec_init = gimple_build (stmts,
9285 (induction_type == vect_step_op_shr
9286 ? RSHIFT_EXPR : LSHIFT_EXPR),
9287 vectype, vec_init, vec_shift);
9288 break;
9290 case vect_step_op_neg:
9292 vec_init = gimple_build_vector_from_val (stmts,
9293 vectype,
9294 new_name);
9295 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9296 vectype, vec_init);
9297 /* The encoding has 2 interleaved stepped patterns. */
9298 vec_perm_builder sel (nunits, 2, 3);
9299 sel.quick_grow (6);
9300 for (i = 0; i < 3; i++)
9302 sel[2 * i] = i;
9303 sel[2 * i + 1] = i + nunits;
9305 vec_perm_indices indices (sel, 2, nunits);
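/* For nunits = 4 this yields the indices { 0, 4, 1, 5 }, interleaving
vec_init and vec_neg to produce the initial vector [X, -X, X, -X]. */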
9306 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9307 fail when vec_init is a const vector. In that situation the vec_perm is
9308 not really needed. */
9309 tree perm_mask_even
9310 = vect_gen_perm_mask_any (vectype, indices);
9311 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9312 vectype,
9313 vec_init, vec_neg,
9314 perm_mask_even);
9316 break;
9318 case vect_step_op_mul:
9320 /* Use an unsigned multiplication to avoid undefined behavior on signed overflow. */
9321 gcc_assert (nunits.is_constant (&const_nunits));
9322 tree utype = unsigned_type_for (itype);
9323 tree uvectype = build_vector_type (utype,
9324 TYPE_VECTOR_SUBPARTS (vectype));
9325 new_name = gimple_convert (stmts, utype, new_name);
9326 vec_init = gimple_build_vector_from_val (stmts,
9327 uvectype,
9328 new_name);
9329 tree_vector_builder elts (uvectype, const_nunits, 1);
9330 tree elt_step = build_one_cst (utype);
9332 elts.quick_push (elt_step);
9333 for (i = 1; i < const_nunits; i++)
9335 /* Create: elt_step_i = elt_step_{i-1} * step_expr, i.e. pow (step_expr, i). */
9336 elt_step = gimple_build (stmts, MULT_EXPR,
9337 utype, elt_step, step_expr);
9338 elts.quick_push (elt_step);
9340 /* Create a vector from [1, step_expr, pow (step_expr, 2), ...,
9341 pow (step_expr, nunits-1)]. */
9342 tree vec_mul = gimple_build_vector (stmts, &elts);
9343 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9344 vec_init, vec_mul);
9345 vec_init = gimple_convert (stmts, vectype, vec_init);
9347 break;
9349 default:
9350 gcc_unreachable ();
9353 return vec_init;
9356 /* Peel init_expr by skip_niters iterations according to induction_type. */
9357 tree
9358 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9359 tree skip_niters, tree step_expr,
9360 enum vect_induction_op_type induction_type)
9362 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9363 tree type = TREE_TYPE (init_expr);
9364 unsigned prec = TYPE_PRECISION (type);
9365 switch (induction_type)
9367 case vect_step_op_neg:
9368 if (TREE_INT_CST_LOW (skip_niters) % 2)
9369 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9370 /* else no change. */
9371 break;
9373 case vect_step_op_shr:
9374 case vect_step_op_shl:
9375 skip_niters = gimple_convert (stmts, type, skip_niters);
9376 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9377 /* When the shift amount is >= the precision we need to avoid undefined behavior.
9378 In the original loop there is no UB, and according to the semantics
9379 init_expr becomes 0 for lshr and ashl, and is shifted right by (prec - 1) for ashr. */
9380 if (!tree_fits_uhwi_p (step_expr)
9381 || tree_to_uhwi (step_expr) >= prec)
9383 if (induction_type == vect_step_op_shl
9384 || TYPE_UNSIGNED (type))
9385 init_expr = build_zero_cst (type);
9386 else
9387 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9388 init_expr,
9389 wide_int_to_tree (type, prec - 1));
9391 else
9392 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9393 ? RSHIFT_EXPR : LSHIFT_EXPR),
9394 type, init_expr, step_expr);
9395 break;
9397 case vect_step_op_mul:
9399 tree utype = unsigned_type_for (type);
9400 init_expr = gimple_convert (stmts, utype, init_expr);
9401 wide_int skipn = wi::to_wide (skip_niters);
9402 wide_int begin = wi::to_wide (step_expr);
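/* Compute pow (step_expr, skip_niters) modulo 2^precision with mpz_powm;
peeling SKIP_NITERS iterations of a mul IV multiplies the initial value
by step^skip_niters, and the modulus keeps the result within the type. */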
9403 auto_mpz base, exp, mod, res;
9404 wi::to_mpz (begin, base, TYPE_SIGN (type));
9405 wi::to_mpz (skipn, exp, UNSIGNED);
9406 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9407 mpz_powm (res, base, exp, mod);
9408 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9409 tree mult_expr = wide_int_to_tree (utype, begin);
9410 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9411 init_expr, mult_expr);
9412 init_expr = gimple_convert (stmts, type, init_expr);
9414 break;
9416 default:
9417 gcc_unreachable ();
9420 return init_expr;
9423 /* Create vector step for vectorized iv. */
9424 static tree
9425 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9426 poly_uint64 vf,
9427 enum vect_induction_op_type induction_type)
9429 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9430 tree new_name = NULL;
9431 /* Step should be pow (step, vf) for mult induction. */
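/* E.g. with VF = 4 each vector iteration advances the IV by four scalar
iterations, so the vector step is step^4. */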
9432 if (induction_type == vect_step_op_mul)
9434 gcc_assert (vf.is_constant ());
9435 wide_int begin = wi::to_wide (step_expr);
9437 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9438 begin = wi::mul (begin, wi::to_wide (step_expr));
9440 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9442 else if (induction_type == vect_step_op_neg)
9443 /* Do nothing. */
9445 else
9446 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9447 expr, step_expr);
9448 return new_name;
9451 static tree
9452 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9453 stmt_vec_info stmt_info,
9454 tree new_name, tree vectype,
9455 enum vect_induction_op_type induction_type)
9457 /* No step is needed for neg induction. */
9458 if (induction_type == vect_step_op_neg)
9459 return NULL;
9461 tree t = unshare_expr (new_name);
9462 gcc_assert (CONSTANT_CLASS_P (new_name)
9463 || TREE_CODE (new_name) == SSA_NAME);
9464 tree new_vec = build_vector_from_val (vectype, t);
9465 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9466 new_vec, vectype, NULL);
9467 return vec_step;
9470 /* Update the vectorized IV INDUC_DEF by VEC_STEP according to INDUCTION_TYPE. */
9471 static tree
9472 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9473 tree induc_def, tree vec_step,
9474 enum vect_induction_op_type induction_type)
9476 tree vec_def = induc_def;
9477 switch (induction_type)
9479 case vect_step_op_mul:
9481 /* Use an unsigned multiplication to avoid undefined behavior on signed overflow. */
9482 tree uvectype
9483 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9484 TYPE_VECTOR_SUBPARTS (vectype));
9485 vec_def = gimple_convert (stmts, uvectype, vec_def);
9486 vec_step = gimple_convert (stmts, uvectype, vec_step);
9487 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9488 vec_def, vec_step);
9489 vec_def = gimple_convert (stmts, vectype, vec_def);
9491 break;
9493 case vect_step_op_shr:
9494 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9495 vec_def, vec_step);
9496 break;
9498 case vect_step_op_shl:
9499 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9500 vec_def, vec_step);
9501 break;
9502 case vect_step_op_neg:
9503 vec_def = induc_def;
9504 /* Do nothing. */
9505 break;
9506 default:
9507 gcc_unreachable ();
9510 return vec_def;
9514 /* Function vectorizable_nonlinear_induction
9516 Check if STMT_INFO performs a nonlinear induction computation that can be
9517 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9518 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9519 basic block.
9520 Return true if STMT_INFO is vectorizable in this way. */
9522 static bool
9523 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9524 stmt_vec_info stmt_info,
9525 gimple **vec_stmt, slp_tree slp_node,
9526 stmt_vector_for_cost *cost_vec)
9528 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9529 unsigned ncopies;
9530 bool nested_in_vect_loop = false;
9531 class loop *iv_loop;
9532 tree vec_def;
9533 edge pe = loop_preheader_edge (loop);
9534 basic_block new_bb;
9535 tree vec_init, vec_step;
9536 tree new_name;
9537 gimple *new_stmt;
9538 gphi *induction_phi;
9539 tree induc_def, vec_dest;
9540 tree init_expr, step_expr;
9541 tree niters_skip;
9542 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9543 unsigned i;
9544 gimple_stmt_iterator si;
9546 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9548 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9549 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9550 enum vect_induction_op_type induction_type
9551 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9553 gcc_assert (induction_type > vect_step_op_add);
9555 if (slp_node)
9556 ncopies = 1;
9557 else
9558 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9559 gcc_assert (ncopies >= 1);
9561 /* FORNOW. Only handle nonlinear induction in the same loop. */
9562 if (nested_in_vect_loop_p (loop, stmt_info))
9564 if (dump_enabled_p ())
9565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9566 "nonlinear induction in nested loop.\n");
9567 return false;
9570 iv_loop = loop;
9571 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9573 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9574 update for each iv and a permutation to generate wanted vector iv. */
9575 if (slp_node)
9577 if (dump_enabled_p ())
9578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9579 "SLP induction not supported for nonlinear"
9580 " induction.\n");
9581 return false;
9584 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9586 if (dump_enabled_p ())
9587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9588 "floating point nonlinear induction vectorization"
9589 " not supported.\n");
9590 return false;
9593 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9594 init_expr = vect_phi_initial_value (phi);
9595 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9596 && TREE_CODE (step_expr) == INTEGER_CST);
9597 /* step_expr should have the same type as init_expr,
9598 i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9599 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9601 if (TREE_CODE (init_expr) == INTEGER_CST)
9602 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9603 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9605 /* INIT_EXPR could be a bit_field; bail out in that case. */
9606 if (dump_enabled_p ())
9607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9608 "nonlinear induction vectorization failed:"
9609 " component type of vectype is not a nop conversion"
9610 " from type of init_expr.\n");
9611 return false;
9614 switch (induction_type)
9616 case vect_step_op_neg:
9617 if (TREE_CODE (init_expr) != INTEGER_CST
9618 && TREE_CODE (init_expr) != REAL_CST)
9620 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9621 if (!directly_supported_p (NEGATE_EXPR, vectype))
9622 return false;
9624 /* The encoding has 2 interleaved stepped patterns. */
9625 vec_perm_builder sel (nunits, 2, 3);
9626 machine_mode mode = TYPE_MODE (vectype);
9627 sel.quick_grow (6);
9628 for (i = 0; i < 3; i++)
9630 sel[i * 2] = i;
9631 sel[i * 2 + 1] = i + nunits;
9633 vec_perm_indices indices (sel, 2, nunits);
9634 if (!can_vec_perm_const_p (mode, mode, indices))
9635 return false;
9637 break;
9639 case vect_step_op_mul:
9641 /* Check for backend support of MULT_EXPR. */
9642 if (!directly_supported_p (MULT_EXPR, vectype))
9643 return false;
9645 /* ??? It is unclear how to construct the vector step for variable-length
9646 vectors: [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
9647 if (!vf.is_constant ())
9648 return false;
9650 break;
9652 case vect_step_op_shr:
9653 /* Check for backend support of RSHIFT_EXPR. */
9654 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9655 return false;
9657 /* Don't shift more than the type precision to avoid undefined behavior. */
9658 if (!tree_fits_uhwi_p (step_expr)
9659 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9660 TYPE_PRECISION (TREE_TYPE (init_expr))))
9661 return false;
9662 break;
9664 case vect_step_op_shl:
9665 /* Check for backend support of LSHIFT_EXPR. */
9666 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9667 return false;
9669 /* Don't shift more than the type precision to avoid undefined behavior. */
9670 if (!tree_fits_uhwi_p (step_expr)
9671 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9672 TYPE_PRECISION (TREE_TYPE (init_expr))))
9673 return false;
9675 break;
9677 default:
9678 gcc_unreachable ();
9681 if (!vec_stmt) /* transformation not required. */
9683 unsigned inside_cost = 0, prologue_cost = 0;
9684 /* loop cost for vec_loop. */
9686 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9687 stmt_info, 0, vect_body);
9689 /* Neg induction doesn't have any inside_cost. */
9691 if (induction_type == vect_step_op_neg)
9692 inside_cost = 0;
9694 /* prologue cost for vec_init and vec_step. */
9695 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9696 stmt_info, 0, vect_prologue);
9698 if (dump_enabled_p ())
9699 dump_printf_loc (MSG_NOTE, vect_location,
9700 "vect_model_induction_cost: inside_cost = %d, "
9701 "prologue_cost = %d. \n", inside_cost,
9702 prologue_cost);
9704 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9705 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9706 return true;
9709 /* Transform. */
9711 /* Compute a vector variable, initialized with the first VF values of
9712 the induction variable. E.g., for an iv with IV_PHI='X' and
9713 evolution S, for a vector of 4 units, we want to compute e.g. for a mul IV:
9714 [X, X*S, X*S^2, X*S^3] (shr, shl and neg IVs analogously). */
9716 if (dump_enabled_p ())
9717 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9719 pe = loop_preheader_edge (iv_loop);
9720 /* Find the first insertion point in the BB. */
9721 basic_block bb = gimple_bb (phi);
9722 si = gsi_after_labels (bb);
9724 gimple_seq stmts = NULL;
9726 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9727 /* If we are using the loop mask to "peel" for alignment then we need
9728 to adjust the start value here. */
9729 if (niters_skip != NULL_TREE)
9730 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9731 step_expr, induction_type);
9733 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9734 step_expr, nunits, vectype,
9735 induction_type);
9736 if (stmts)
9738 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9739 gcc_assert (!new_bb);
9742 stmts = NULL;
9743 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9744 vf, induction_type);
9745 if (stmts)
9747 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9748 gcc_assert (!new_bb);
9751 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9752 new_name, vectype,
9753 induction_type);
9754 /* Create the following def-use cycle:
9755 loop prolog:
9756 vec_init = ...
9757 vec_step = ...
9758 loop:
9759 vec_iv = PHI <vec_init, vec_loop>
9761 STMT
9763 vec_loop = vec_iv + vec_step; */
9765 /* Create the induction-phi that defines the induction-operand. */
9766 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9767 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9768 induc_def = PHI_RESULT (induction_phi);
9770 /* Create the iv update inside the loop. */
9771 stmts = NULL;
9772 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9773 induc_def, vec_step,
9774 induction_type);
9776 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9777 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9779 /* Set the arguments of the phi node: */
9780 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9781 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9782 UNKNOWN_LOCATION);
9784 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9785 *vec_stmt = induction_phi;
9787 /* In case the vectorization factor (VF) is bigger than the number
9788 of elements that we can fit in a vectype (nunits), we have to generate
9789 more than one vector stmt, i.e. we need to "unroll" the
9790 vector stmt by a factor of VF/nunits. For more details see the
9791 documentation in vectorizable_operation. */
9793 if (ncopies > 1)
9795 stmts = NULL;
9796 /* FORNOW. This restriction should be relaxed. */
9797 gcc_assert (!nested_in_vect_loop);
9799 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9800 nunits, induction_type);
9802 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9803 new_name, vectype,
9804 induction_type);
9805 vec_def = induc_def;
9806 for (i = 1; i < ncopies; i++)
9808 /* vec_i = vec_prev + vec_step. */
9809 stmts = NULL;
9810 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9811 vec_def, vec_step,
9812 induction_type);
9813 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9814 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9815 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9819 if (dump_enabled_p ())
9820 dump_printf_loc (MSG_NOTE, vect_location,
9821 "transform induction: created def-use cycle: %G%G",
9822 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9824 return true;
9827 /* Function vectorizable_induction
9829 Check if STMT_INFO performs an induction computation that can be vectorized.
9830 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9831 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9832 Return true if STMT_INFO is vectorizable in this way. */
9834 bool
9835 vectorizable_induction (loop_vec_info loop_vinfo,
9836 stmt_vec_info stmt_info,
9837 gimple **vec_stmt, slp_tree slp_node,
9838 stmt_vector_for_cost *cost_vec)
9840 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9841 unsigned ncopies;
9842 bool nested_in_vect_loop = false;
9843 class loop *iv_loop;
9844 tree vec_def;
9845 edge pe = loop_preheader_edge (loop);
9846 basic_block new_bb;
9847 tree new_vec, vec_init, vec_step, t;
9848 tree new_name;
9849 gimple *new_stmt;
9850 gphi *induction_phi;
9851 tree induc_def, vec_dest;
9852 tree init_expr, step_expr;
9853 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9854 unsigned i;
9855 tree expr;
9856 gimple_stmt_iterator si;
9857 enum vect_induction_op_type induction_type
9858 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9860 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9861 if (!phi)
9862 return false;
9864 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9865 return false;
9867 /* Make sure it was recognized as induction computation. */
9868 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9869 return false;
9871 /* Handle nonlinear induction in a separate place. */
9872 if (induction_type != vect_step_op_add)
9873 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9874 vec_stmt, slp_node, cost_vec);
9876 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9877 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9879 if (slp_node)
9880 ncopies = 1;
9881 else
9882 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9883 gcc_assert (ncopies >= 1);
9885 /* FORNOW. These restrictions should be relaxed. */
9886 if (nested_in_vect_loop_p (loop, stmt_info))
9888 imm_use_iterator imm_iter;
9889 use_operand_p use_p;
9890 gimple *exit_phi;
9891 edge latch_e;
9892 tree loop_arg;
9894 if (ncopies > 1)
9896 if (dump_enabled_p ())
9897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9898 "multiple types in nested loop.\n");
9899 return false;
9902 exit_phi = NULL;
9903 latch_e = loop_latch_edge (loop->inner);
9904 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9905 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9907 gimple *use_stmt = USE_STMT (use_p);
9908 if (is_gimple_debug (use_stmt))
9909 continue;
9911 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9913 exit_phi = use_stmt;
9914 break;
9917 if (exit_phi)
9919 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9920 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9921 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9923 if (dump_enabled_p ())
9924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9925 "inner-loop induction only used outside "
9926 "of the outer vectorized loop.\n");
9927 return false;
9931 nested_in_vect_loop = true;
9932 iv_loop = loop->inner;
9934 else
9935 iv_loop = loop;
9936 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9938 if (slp_node && !nunits.is_constant ())
9940 /* The current SLP code creates the step value element-by-element. */
9941 if (dump_enabled_p ())
9942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9943 "SLP induction not supported for variable-length"
9944 " vectors.\n");
9945 return false;
9948 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9950 if (dump_enabled_p ())
9951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9952 "floating point induction vectorization disabled\n");
9953 return false;
9956 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9957 gcc_assert (step_expr != NULL_TREE);
9958 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9960 /* Check for backend support of PLUS/MINUS_EXPR. */
9961 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9962 || !directly_supported_p (MINUS_EXPR, step_vectype))
9963 return false;
9965 if (!vec_stmt) /* transformation not required. */
9967 unsigned inside_cost = 0, prologue_cost = 0;
9968 if (slp_node)
9970 /* We eventually need to set a vector type on invariant
9971 arguments. */
9972 unsigned j;
9973 slp_tree child;
9974 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9975 if (!vect_maybe_update_slp_op_vectype
9976 (child, SLP_TREE_VECTYPE (slp_node)))
9978 if (dump_enabled_p ())
9979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9980 "incompatible vector types for "
9981 "invariants\n");
9982 return false;
9984 /* loop cost for vec_loop. */
9985 inside_cost
9986 = record_stmt_cost (cost_vec,
9987 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9988 vector_stmt, stmt_info, 0, vect_body);
9989 /* prologue cost for vec_init (if not nested) and step. */
9990 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9991 scalar_to_vec,
9992 stmt_info, 0, vect_prologue);
9994 else /* if (!slp_node) */
9996 /* loop cost for vec_loop. */
9997 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9998 stmt_info, 0, vect_body);
9999 /* prologue cost for vec_init and vec_step. */
10000 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10001 stmt_info, 0, vect_prologue);
10003 if (dump_enabled_p ())
10004 dump_printf_loc (MSG_NOTE, vect_location,
10005 "vect_model_induction_cost: inside_cost = %d, "
10006 "prologue_cost = %d .\n", inside_cost,
10007 prologue_cost);
10009 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10010 DUMP_VECT_SCOPE ("vectorizable_induction");
10011 return true;
10014 /* Transform. */
10016 /* Compute a vector variable, initialized with the first VF values of
10017 the induction variable. E.g., for an iv with IV_PHI='X' and
10018 evolution S, for a vector of 4 units, we want to compute:
10019 [X, X + S, X + 2*S, X + 3*S]. */
10021 if (dump_enabled_p ())
10022 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10024 pe = loop_preheader_edge (iv_loop);
10025 /* Find the first insertion point in the BB. */
10026 basic_block bb = gimple_bb (phi);
10027 si = gsi_after_labels (bb);
10029 /* For SLP induction we have to generate several IVs as for example
10030 with group size 3 we need
10031 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10032 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10033 if (slp_node)
10035 /* Enforced above. */
10036 unsigned int const_nunits = nunits.to_constant ();
10038 /* The initial values are vectorized, but any lanes > group_size
10039 need adjustment. */
10040 slp_tree init_node
10041 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10043 /* Gather steps. Since we do not vectorize inductions as
10044 cycles we have to reconstruct the step from SCEV data. */
10045 unsigned group_size = SLP_TREE_LANES (slp_node);
10046 tree *steps = XALLOCAVEC (tree, group_size);
10047 tree *inits = XALLOCAVEC (tree, group_size);
10048 stmt_vec_info phi_info;
10049 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10051 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10052 if (!init_node)
10053 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10054 pe->dest_idx);
10057 /* Now generate the IVs. */
10058 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10059 gcc_assert ((const_nunits * nvects) % group_size == 0);
10060 unsigned nivs;
10061 if (nested_in_vect_loop)
10062 nivs = nvects;
10063 else
10065 /* Compute the number of distinct IVs we need. First reduce
10066 group_size if it is a multiple of const_nunits so we get
10067 one IV for a group_size of 4 but const_nunits 2. */
10068 unsigned group_sizep = group_size;
10069 if (group_sizep % const_nunits == 0)
10070 group_sizep = group_sizep / const_nunits;
10071 nivs = least_common_multiple (group_sizep,
10072 const_nunits) / const_nunits;
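/* Illustrative example (numbers are hypothetical, not from the original
   code): for group_size == 6 and const_nunits == 4 the group size is not
   a multiple of the vector size, so group_sizep stays 6 and
   nivs = lcm (6, 4) / 4 = 3 distinct IVs are needed; for the
   group_size == 4, const_nunits == 2 case mentioned above we get
   group_sizep = 2 and nivs = lcm (2, 2) / 2 = 1.  */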
10074 tree stept = TREE_TYPE (step_vectype);
10075 tree lupdate_mul = NULL_TREE;
10076 if (!nested_in_vect_loop)
10078 /* The number of iterations covered in one vector iteration. */
10079 unsigned lup_mul = (nvects * const_nunits) / group_size;
10080 lupdate_mul
10081 = build_vector_from_val (step_vectype,
10082 SCALAR_FLOAT_TYPE_P (stept)
10083 ? build_real_from_wide (stept, lup_mul,
10084 UNSIGNED)
10085 : build_int_cstu (stept, lup_mul));
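/* Worked example (hypothetical values): with the group size 3,
   const_nunits 4 layout shown at the top of this block, nvects == 3 and
   lup_mul = (3 * 4) / 3 = 4, i.e. one vector iteration consumes four
   group iterations, so every IV lane advances by four scalar steps per
   latch execution.  */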
10087 tree peel_mul = NULL_TREE;
10088 gimple_seq init_stmts = NULL;
10089 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10091 if (SCALAR_FLOAT_TYPE_P (stept))
10092 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10093 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10094 else
10095 peel_mul = gimple_convert (&init_stmts, stept,
10096 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10097 peel_mul = gimple_build_vector_from_val (&init_stmts,
10098 step_vectype, peel_mul);
10100 unsigned ivn;
10101 auto_vec<tree> vec_steps;
10102 for (ivn = 0; ivn < nivs; ++ivn)
10104 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10105 tree_vector_builder init_elts (vectype, const_nunits, 1);
10106 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10107 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10109 /* The scalar steps of the IVs. */
10110 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10111 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10112 step_elts.quick_push (elt);
10113 if (!init_node)
10115 /* The scalar inits of the IVs if not vectorized. */
10116 elt = inits[(ivn*const_nunits + eltn) % group_size];
10117 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10118 TREE_TYPE (elt)))
10119 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10120 TREE_TYPE (vectype), elt);
10121 init_elts.quick_push (elt);
10123 /* The number of steps to add to the initial values. */
10124 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10125 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10126 ? build_real_from_wide (stept,
10127 mul_elt, UNSIGNED)
10128 : build_int_cstu (stept, mul_elt));
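/* Illustration (hypothetical values, matching the group size 3,
   const_nunits 4 example above): for ivn == 0 this builds
   step_elts = { S0, S1, S2, S0 }, init_elts = { i0, i1, i2, i0 } and
   mul_elts = { 0, 0, 0, 1 }, so the adjusted initial vector computed
   below becomes { i0, i1, i2, i0 + S0 }, the first vector of that
   example.  */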
10130 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10131 vec_steps.safe_push (vec_step);
10132 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10133 if (peel_mul)
10134 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10135 step_mul, peel_mul);
10136 if (!init_node)
10137 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10139 /* Create the induction-phi that defines the induction-operand. */
10140 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10141 "vec_iv_");
10142 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10143 induc_def = PHI_RESULT (induction_phi);
10145 /* Create the iv update inside the loop */
10146 tree up = vec_step;
10147 if (lupdate_mul)
10148 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10149 vec_step, lupdate_mul);
10150 gimple_seq stmts = NULL;
10151 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10152 vec_def = gimple_build (&stmts,
10153 PLUS_EXPR, step_vectype, vec_def, up);
10154 vec_def = gimple_convert (&stmts, vectype, vec_def);
10155 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10156 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10157 UNKNOWN_LOCATION);
10159 if (init_node)
10160 vec_init = vect_get_slp_vect_def (init_node, ivn);
10161 if (!nested_in_vect_loop
10162 && !integer_zerop (step_mul))
10164 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10165 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10166 vec_step, step_mul);
10167 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10168 vec_def, up);
10169 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10172 /* Set the arguments of the phi node: */
10173 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10175 slp_node->push_vec_def (induction_phi);
10177 if (!nested_in_vect_loop)
10179 /* Fill up to the number of vectors we need for the whole group. */
10180 nivs = least_common_multiple (group_size,
10181 const_nunits) / const_nunits;
10182 vec_steps.reserve (nivs-ivn);
10183 for (; ivn < nivs; ++ivn)
10185 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10186 vec_steps.quick_push (vec_steps[0]);
10190 /* Re-use IVs when we can. We are generating further vector
10191 stmts by adding VF' * stride to the IVs generated above. */
10192 if (ivn < nvects)
10194 unsigned vfp
10195 = least_common_multiple (group_size, const_nunits) / group_size;
10196 tree lupdate_mul
10197 = build_vector_from_val (step_vectype,
10198 SCALAR_FLOAT_TYPE_P (stept)
10199 ? build_real_from_wide (stept,
10200 vfp, UNSIGNED)
10201 : build_int_cstu (stept, vfp));
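/* Illustration (hypothetical values): with group_size == 2 and
   const_nunits == 4 only nivs == 1 IV was generated above and
   vfp = lcm (2, 4) / 2 = 2, so each further vector below is the
   previous vector plus twice the per-lane step, covering the next
   two group iterations.  */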
10202 for (; ivn < nvects; ++ivn)
10204 gimple *iv
10205 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10206 tree def = gimple_get_lhs (iv);
10207 if (ivn < 2*nivs)
10208 vec_steps[ivn - nivs]
10209 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10210 vec_steps[ivn - nivs], lupdate_mul);
10211 gimple_seq stmts = NULL;
10212 def = gimple_convert (&stmts, step_vectype, def);
10213 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10214 def, vec_steps[ivn % nivs]);
10215 def = gimple_convert (&stmts, vectype, def);
10216 if (gimple_code (iv) == GIMPLE_PHI)
10217 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10218 else
10220 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10221 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10223 slp_node->push_vec_def (def);
10227 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10228 gcc_assert (!new_bb);
10230 return true;
10233 init_expr = vect_phi_initial_value (phi);
10235 gimple_seq stmts = NULL;
10236 if (!nested_in_vect_loop)
10238 /* Convert the initial value to the IV update type. */
10239 tree new_type = TREE_TYPE (step_expr);
10240 init_expr = gimple_convert (&stmts, new_type, init_expr);
10242 /* If we are using the loop mask to "peel" for alignment then we need
10243 to adjust the start value here. */
10244 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10245 if (skip_niters != NULL_TREE)
10247 if (FLOAT_TYPE_P (vectype))
10248 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10249 skip_niters);
10250 else
10251 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10252 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10253 skip_niters, step_expr);
10254 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10255 init_expr, skip_step);
10259 if (stmts)
10261 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10262 gcc_assert (!new_bb);
10265 /* Create the vector that holds the initial_value of the induction. */
10266 if (nested_in_vect_loop)
10268 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10269 been created during vectorization of previous stmts. We obtain it
10270 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10271 auto_vec<tree> vec_inits;
10272 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10273 init_expr, &vec_inits);
10274 vec_init = vec_inits[0];
10275 /* If the initial value is not of proper type, convert it. */
10276 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10278 new_stmt
10279 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10280 vect_simple_var,
10281 "vec_iv_"),
10282 VIEW_CONVERT_EXPR,
10283 build1 (VIEW_CONVERT_EXPR, vectype,
10284 vec_init));
10285 vec_init = gimple_assign_lhs (new_stmt);
10286 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10287 new_stmt);
10288 gcc_assert (!new_bb);
10291 else
10293 /* iv_loop is the loop to be vectorized. Create:
10294 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10295 stmts = NULL;
10296 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10298 unsigned HOST_WIDE_INT const_nunits;
10299 if (nunits.is_constant (&const_nunits))
10301 tree_vector_builder elts (step_vectype, const_nunits, 1);
10302 elts.quick_push (new_name);
10303 for (i = 1; i < const_nunits; i++)
10305 /* Create: new_name_i = new_name + step_expr */
10306 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10307 new_name, step_expr);
10308 elts.quick_push (new_name);
10310 /* Create a vector from [new_name_0, new_name_1, ...,
10311 new_name_nunits-1] */
10312 vec_init = gimple_build_vector (&stmts, &elts);
10314 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10315 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10316 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10317 new_name, step_expr);
10318 else
10320 /* Build:
10321 [base, base, base, ...]
10322 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10323 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10324 gcc_assert (flag_associative_math);
10325 tree index = build_index_vector (step_vectype, 0, 1);
10326 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10327 new_name);
10328 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10329 step_expr);
10330 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10331 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10332 vec_init, step_vec);
10333 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10334 vec_init, base_vec);
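/* Worked example (hypothetical values): for X == 1.0, S == 0.5 and four
   lanes this computes { 0, 1, 2, 3 } * { 0.5, 0.5, 0.5, 0.5 }
   + { 1.0, 1.0, 1.0, 1.0 } = { 1.0, 1.5, 2.0, 2.5 }.  The reassociation
   implied here is why flag_associative_math is asserted above.  */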
10336 vec_init = gimple_convert (&stmts, vectype, vec_init);
10338 if (stmts)
10340 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10341 gcc_assert (!new_bb);
10346 /* Create the vector that holds the step of the induction. */
10347 gimple_stmt_iterator *step_iv_si = NULL;
10348 if (nested_in_vect_loop)
10349 /* iv_loop is nested in the loop to be vectorized. Generate:
10350 vec_step = [S, S, S, S] */
10351 new_name = step_expr;
10352 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10354 /* When we're using the loop_len produced by SELECT_VL, the non-final
10355 iterations are not always processing VF elements. So instead of
10356 vectorizing the induction-variable update as
10358 _21 = vect_vec_iv_.6_22 + { VF, ... };
10360 We should generate:
10362 _35 = .SELECT_VL (ivtmp_33, VF);
10363 vect_cst__22 = [vec_duplicate_expr] _35;
10364 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10365 gcc_assert (!slp_node);
10366 gimple_seq seq = NULL;
10367 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10368 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10369 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10370 unshare_expr (len)),
10371 &seq, true, NULL_TREE);
10372 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10373 step_expr);
10374 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10375 step_iv_si = &si;
10377 else
10379 /* iv_loop is the loop to be vectorized. Generate:
10380 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10381 gimple_seq seq = NULL;
10382 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10384 expr = build_int_cst (integer_type_node, vf);
10385 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10387 else
10388 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10389 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10390 expr, step_expr);
10391 if (seq)
10393 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10394 gcc_assert (!new_bb);
10398 t = unshare_expr (new_name);
10399 gcc_assert (CONSTANT_CLASS_P (new_name)
10400 || TREE_CODE (new_name) == SSA_NAME);
10401 new_vec = build_vector_from_val (step_vectype, t);
10402 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10403 new_vec, step_vectype, step_iv_si);
10406 /* Create the following def-use cycle:
10407 loop prolog:
10408 vec_init = ...
10409 vec_step = ...
10410 loop:
10411 vec_iv = PHI <vec_init, vec_loop>
10413 STMT
10415 vec_loop = vec_iv + vec_step; */
10417 /* Create the induction-phi that defines the induction-operand. */
10418 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10419 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10420 induc_def = PHI_RESULT (induction_phi);
10422 /* Create the iv update inside the loop */
10423 stmts = NULL;
10424 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10425 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10426 vec_def = gimple_convert (&stmts, vectype, vec_def);
10427 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10428 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10430 /* Set the arguments of the phi node: */
10431 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10432 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10433 UNKNOWN_LOCATION);
10435 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10436 *vec_stmt = induction_phi;
10438 /* In case that vectorization factor (VF) is bigger than the number
10439 of elements that we can fit in a vectype (nunits), we have to generate
10440 more than one vector stmt - i.e - we need to "unroll" the
10441 vector stmt by a factor VF/nunits. For more details see documentation
10442 in vectorizable_operation. */
10444 if (ncopies > 1)
10446 gimple_seq seq = NULL;
10447 /* FORNOW. This restriction should be relaxed. */
10448 gcc_assert (!nested_in_vect_loop);
10449 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10450 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10452 /* Create the vector that holds the step of the induction. */
10453 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10455 expr = build_int_cst (integer_type_node, nunits);
10456 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10458 else
10459 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10460 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10461 expr, step_expr);
10462 if (seq)
10464 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10465 gcc_assert (!new_bb);
10468 t = unshare_expr (new_name);
10469 gcc_assert (CONSTANT_CLASS_P (new_name)
10470 || TREE_CODE (new_name) == SSA_NAME);
10471 new_vec = build_vector_from_val (step_vectype, t);
10472 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10473 new_vec, step_vectype, NULL);
10475 vec_def = induc_def;
10476 for (i = 1; i < ncopies + 1; i++)
10478 /* vec_i = vec_prev + vec_step */
10479 gimple_seq stmts = NULL;
10480 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10481 vec_def = gimple_build (&stmts,
10482 PLUS_EXPR, step_vectype, vec_def, vec_step);
10483 vec_def = gimple_convert (&stmts, vectype, vec_def);
10485 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10486 if (i < ncopies)
10488 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10489 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10491 else
10493 /* vec_1 = vec_iv + (VF/n * S)
10494 vec_2 = vec_1 + (VF/n * S)
10496 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10498 vec_n is used as vec_loop to save the large step register and
10499 related operations. */
10500 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10501 UNKNOWN_LOCATION);
10506 if (dump_enabled_p ())
10507 dump_printf_loc (MSG_NOTE, vect_location,
10508 "transform induction: created def-use cycle: %G%G",
10509 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10511 return true;
10514 /* Function vectorizable_live_operation.
10516 STMT_INFO computes a value that is used outside the loop. Check if
10517 it can be supported. */
10519 bool
10520 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10521 slp_tree slp_node, slp_instance slp_node_instance,
10522 int slp_index, bool vec_stmt_p,
10523 stmt_vector_for_cost *cost_vec)
10525 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10526 imm_use_iterator imm_iter;
10527 tree lhs, lhs_type, bitsize;
10528 tree vectype = (slp_node
10529 ? SLP_TREE_VECTYPE (slp_node)
10530 : STMT_VINFO_VECTYPE (stmt_info));
10531 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10532 int ncopies;
10533 gimple *use_stmt;
10534 auto_vec<tree> vec_oprnds;
10535 int vec_entry = 0;
10536 poly_uint64 vec_index = 0;
10538 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10540 /* If a stmt of a reduction is live, vectorize it via
10541 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10542 validity so just trigger the transform here. */
10543 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10545 if (!vec_stmt_p)
10546 return true;
10547 if (slp_node)
10549 /* For reduction chains the meta-info is attached to
10550 the group leader. */
10551 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10552 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10553 /* For SLP reductions we vectorize the epilogue for
10554 all involved stmts together. */
10555 else if (slp_index != 0)
10556 return true;
10558 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10559 gcc_assert (reduc_info->is_reduc_info);
10560 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10561 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10562 return true;
10563 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10564 slp_node_instance);
10565 return true;
10568 /* If STMT is not relevant and it is a simple assignment and its inputs are
10569 invariant then it can remain in place, unvectorized. The original last
10570 scalar value that it computes will be used. */
10571 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10573 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10574 if (dump_enabled_p ())
10575 dump_printf_loc (MSG_NOTE, vect_location,
10576 "statement is simple and uses invariant. Leaving in "
10577 "place.\n");
10578 return true;
10581 if (slp_node)
10582 ncopies = 1;
10583 else
10584 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10586 if (slp_node)
10588 gcc_assert (slp_index >= 0);
10590 /* Get the last occurrence of the scalar index from the concatenation of
10591 all the slp vectors. Calculate which slp vector it is and the index
10592 within. */
10593 int num_scalar = SLP_TREE_LANES (slp_node);
10594 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10595 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
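/* Illustration (hypothetical values): with 3 lanes, nunits == 4 and
   3 vector stmts the concatenation holds 12 scalar results; the last
   occurrence of lane 1 sits at position 3 * 4 - 3 + 1 = 10, i.e.
   lane 2 of vector 2 as computed by the division below.  */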
10597 /* Calculate which vector contains the result, and which lane of
10598 that vector we need. */
10599 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10601 if (dump_enabled_p ())
10602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10603 "Cannot determine which vector holds the"
10604 " final result.\n");
10605 return false;
10609 if (!vec_stmt_p)
10611 /* No transformation required. */
10612 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10614 if (slp_node)
10616 if (dump_enabled_p ())
10617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10618 "can't operate on partial vectors "
10619 "because an SLP statement is live after "
10620 "the loop.\n");
10621 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10623 else if (ncopies > 1)
10625 if (dump_enabled_p ())
10626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10627 "can't operate on partial vectors "
10628 "because ncopies is greater than 1.\n");
10629 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10631 else
10633 gcc_assert (ncopies == 1 && !slp_node);
10634 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10635 OPTIMIZE_FOR_SPEED))
10636 vect_record_loop_mask (loop_vinfo,
10637 &LOOP_VINFO_MASKS (loop_vinfo),
10638 1, vectype, NULL);
10639 else if (can_vec_extract_var_idx_p (
10640 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10641 vect_record_loop_len (loop_vinfo,
10642 &LOOP_VINFO_LENS (loop_vinfo),
10643 1, vectype, 1);
10644 else
10646 if (dump_enabled_p ())
10647 dump_printf_loc (
10648 MSG_MISSED_OPTIMIZATION, vect_location,
10649 "can't operate on partial vectors "
10650 "because the target doesn't support extract "
10651 "last reduction.\n");
10652 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10656 /* ??? Enable for loop costing as well. */
10657 if (!loop_vinfo)
10658 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10659 0, vect_epilogue);
10660 return true;
10663 /* Use the lhs of the original scalar statement. */
10664 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10665 if (dump_enabled_p ())
10666 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10667 "stmt %G", stmt);
10669 lhs = gimple_get_lhs (stmt);
10670 lhs_type = TREE_TYPE (lhs);
10672 bitsize = vector_element_bits_tree (vectype);
10674 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10675 tree vec_lhs, bitstart;
10676 gimple *vec_stmt;
10677 if (slp_node)
10679 gcc_assert (!loop_vinfo
10680 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10681 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10683 /* Get the correct slp vectorized stmt. */
10684 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10685 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10687 /* Get entry to use. */
10688 bitstart = bitsize_int (vec_index);
10689 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10691 else
10693 /* For multiple copies, get the last copy. */
10694 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10695 vec_lhs = gimple_get_lhs (vec_stmt);
10697 /* Get the last lane in the vector. */
10698 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
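/* E.g. (illustrative) for a vector of four 32-bit elements the last
   lane starts at bit 3 * 32 = 96.  */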
10701 if (loop_vinfo)
10703 /* To ensure that the VEC_LHS for lane-extraction stmts satisfies the
10704 loop-closed PHI requirement, insert one PHI node for it. It looks like:
10705 loop;
10707 # lhs' = PHI <lhs>
10709 loop;
10711 # vec_lhs' = PHI <vec_lhs>
10712 new_tree = lane_extract <vec_lhs', ...>;
10713 lhs' = new_tree; */
10715 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10716 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
10717 gcc_assert (single_pred_p (exit_bb));
10719 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10720 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10721 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
10723 gimple_seq stmts = NULL;
10724 tree new_tree;
10725 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10727 /* Emit:
10729 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10731 where VEC_LHS is the vectorized live-out result and LEN is
10732 the loop length of the final iteration. */
10733 gcc_assert (ncopies == 1 && !slp_node);
10734 gimple_seq tem = NULL;
10735 gimple_stmt_iterator gsi = gsi_last (tem);
10736 tree len
10737 = vect_get_loop_len (loop_vinfo, &gsi,
10738 &LOOP_VINFO_LENS (loop_vinfo),
10739 1, vectype, 0, 0);
10741 /* BIAS - 1. */
10742 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10743 tree bias_minus_one
10744 = int_const_binop (MINUS_EXPR,
10745 build_int_cst (TREE_TYPE (len), biasval),
10746 build_one_cst (TREE_TYPE (len)));
10748 /* LAST_INDEX = LEN + (BIAS - 1). */
10749 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10750 len, bias_minus_one);
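/* Illustration (hypothetical values): with a zero bias and LEN == 5
   active elements in the final iteration, LAST_INDEX = 5 + (0 - 1) = 4,
   the last active lane.  */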
10752 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10753 tree scalar_res
10754 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10755 vec_lhs_phi, last_index);
10757 /* Convert the extracted vector element to the scalar type. */
10758 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10760 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10762 /* Emit:
10764 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10766 where VEC_LHS is the vectorized live-out result and MASK is
10767 the loop mask for the final iteration. */
10768 gcc_assert (ncopies == 1 && !slp_node);
10769 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10770 gimple_seq tem = NULL;
10771 gimple_stmt_iterator gsi = gsi_last (tem);
10772 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10773 &LOOP_VINFO_MASKS (loop_vinfo),
10774 1, vectype, 0);
10775 gimple_seq_add_seq (&stmts, tem);
10776 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10777 mask, vec_lhs_phi);
10779 /* Convert the extracted vector element to the scalar type. */
10780 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10782 else
10784 tree bftype = TREE_TYPE (vectype);
10785 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10786 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10787 new_tree = build3 (BIT_FIELD_REF, bftype,
10788 vec_lhs_phi, bitsize, bitstart);
10789 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10790 &stmts, true, NULL_TREE);
10793 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10794 if (stmts)
10795 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10797 /* Remove existing phis that copy from lhs and create copies
10798 from new_tree. */
10799 gimple_stmt_iterator gsi;
10800 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
10802 gimple *phi = gsi_stmt (gsi);
10803 if (gimple_phi_arg_def (phi, 0) == lhs)
10805 remove_phi_node (&gsi, false);
10806 tree lhs_phi = gimple_phi_result (phi);
10807 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10808 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10810 else
10811 gsi_next (&gsi);
10814 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
10815 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10816 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10818 else
10820 /* For basic-block vectorization simply insert the lane-extraction. */
10821 tree bftype = TREE_TYPE (vectype);
10822 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10823 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10824 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10825 vec_lhs, bitsize, bitstart);
10826 gimple_seq stmts = NULL;
10827 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10828 &stmts, true, NULL_TREE);
10829 if (TREE_CODE (new_tree) == SSA_NAME
10830 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10831 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10832 if (is_a <gphi *> (vec_stmt))
10834 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10835 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10837 else
10839 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10840 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10843 /* Replace uses of lhs with the newly computed result. If the use stmt is a
10844 single-argument PHI, just replace all uses of the PHI result. This is
10845 necessary because the LC-SSA PHI defining lhs may be before the newly inserted stmt. */
10846 use_operand_p use_p;
10847 stmt_vec_info use_stmt_info;
10848 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10849 if (!is_gimple_debug (use_stmt)
10850 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10851 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10853 /* ??? This can happen when the live lane ends up being
10854 rooted in a vector construction code-generated by an
10855 external SLP node (and code-generation for that already
10856 happened). See gcc.dg/vect/bb-slp-47.c.
10857 Doing this is what would happen if that vector CTOR
10858 were not code-generated yet so it is not too bad.
10859 ??? In fact we'd likely want to avoid this situation
10860 in the first place. */
10861 if (TREE_CODE (new_tree) == SSA_NAME
10862 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10863 && gimple_code (use_stmt) != GIMPLE_PHI
10864 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10865 use_stmt))
10867 if (dump_enabled_p ())
10868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10869 "Using original scalar computation for "
10870 "live lane because use preceeds vector "
10871 "def\n");
10872 continue;
10874 /* ??? It can also happen that we end up pulling a def into
10875 a loop where replacing out-of-loop uses would require
10876 a new LC SSA PHI node. Retain the original scalar in
10877 those cases as well. PR98064. */
10878 if (TREE_CODE (new_tree) == SSA_NAME
10879 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10880 && (gimple_bb (use_stmt)->loop_father
10881 != gimple_bb (vec_stmt)->loop_father)
10882 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10883 gimple_bb (use_stmt)->loop_father))
10885 if (dump_enabled_p ())
10886 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10887 "Using original scalar computation for "
10888 "live lane because there is an out-of-loop "
10889 "definition for it\n");
10890 continue;
10892 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10893 SET_USE (use_p, new_tree);
10894 update_stmt (use_stmt);
10898 return true;
10901 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10903 static void
10904 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10906 ssa_op_iter op_iter;
10907 imm_use_iterator imm_iter;
10908 def_operand_p def_p;
10909 gimple *ustmt;
10911 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10913 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10915 basic_block bb;
10917 if (!is_gimple_debug (ustmt))
10918 continue;
10920 bb = gimple_bb (ustmt);
10922 if (!flow_bb_inside_loop_p (loop, bb))
10924 if (gimple_debug_bind_p (ustmt))
10926 if (dump_enabled_p ())
10927 dump_printf_loc (MSG_NOTE, vect_location,
10928 "killing debug use\n");
10930 gimple_debug_bind_reset_value (ustmt);
10931 update_stmt (ustmt);
10933 else
10934 gcc_unreachable ();
10940 /* Given loop represented by LOOP_VINFO, return true if computation of
10941 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10942 otherwise. */
10944 static bool
10945 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10947 /* Constant case. */
10948 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10950 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10951 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10953 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10954 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10955 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10956 return true;
10959 widest_int max;
10960 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10961 /* Check the upper bound of loop niters. */
10962 if (get_max_loop_iterations (loop, &max))
10964 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10965 signop sgn = TYPE_SIGN (type);
10966 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10967 if (max < type_max)
10968 return true;
10970 return false;
10973 /* Return a mask type with half the number of elements as OLD_TYPE,
10974 given that it should have mode NEW_MODE. */
10976 tree
10977 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10979 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10980 return build_truth_vector_type_for_mode (nunits, new_mode);
10983 /* Return a mask type with twice as many elements as OLD_TYPE,
10984 given that it should have mode NEW_MODE. */
10986 tree
10987 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10989 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10990 return build_truth_vector_type_for_mode (nunits, new_mode);
10993 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10994 contain a sequence of NVECTORS masks that each control a vector of type
10995 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10996 these vector masks with the vector version of SCALAR_MASK. */
10998 void
10999 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11000 unsigned int nvectors, tree vectype, tree scalar_mask)
11002 gcc_assert (nvectors != 0);
11004 if (scalar_mask)
11006 scalar_cond_masked_key cond (scalar_mask, nvectors);
11007 loop_vinfo->scalar_cond_masked_set.add (cond);
11010 masks->mask_set.add (std::make_pair (vectype, nvectors));
11013 /* Given a complete set of masks MASKS, extract mask number INDEX
11014 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11015 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11017 See the comment above vec_loop_masks for more details about the mask
11018 arrangement. */
11020 tree
11021 vect_get_loop_mask (loop_vec_info loop_vinfo,
11022 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11023 unsigned int nvectors, tree vectype, unsigned int index)
11025 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11026 == vect_partial_vectors_while_ult)
11028 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11029 tree mask_type = rgm->type;
11031 /* Populate the rgroup's mask array, if this is the first time we've
11032 used it. */
11033 if (rgm->controls.is_empty ())
11035 rgm->controls.safe_grow_cleared (nvectors, true);
11036 for (unsigned int i = 0; i < nvectors; ++i)
11038 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11039 /* Provide a dummy definition until the real one is available. */
11040 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11041 rgm->controls[i] = mask;
11045 tree mask = rgm->controls[index];
11046 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11047 TYPE_VECTOR_SUBPARTS (vectype)))
11049 /* A loop mask for data type X can be reused for data type Y
11050 if X has N times more elements than Y and if Y's elements
11051 are N times bigger than X's. In this case each sequence
11052 of N elements in the loop mask will be all-zero or all-one.
11053 We can then view-convert the mask so that each sequence of
11054 N elements is replaced by a single element. */
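/* Illustration (hypothetical types): a mask created for a V8HI rgroup
   can be reused for V4SI; each pair of its elements is known to be
   all-zero or all-one, so view-converting it to a 4-element mask
   collapses every pair into a single lane.  */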
11055 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11056 TYPE_VECTOR_SUBPARTS (vectype)));
11057 gimple_seq seq = NULL;
11058 mask_type = truth_type_for (vectype);
11059 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11060 if (seq)
11061 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11063 return mask;
11065 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11066 == vect_partial_vectors_avx512)
11068 /* The number of scalars per iteration and the number of vectors are
11069 both compile-time constants. */
11070 unsigned int nscalars_per_iter
11071 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11072 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11074 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11076 /* The stored nV is dependent on the mask type produced. */
11077 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11078 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11079 == rgm->factor);
11080 nvectors = rgm->factor;
11082 /* Populate the rgroup's mask array, if this is the first time we've
11083 used it. */
11084 if (rgm->controls.is_empty ())
11086 rgm->controls.safe_grow_cleared (nvectors, true);
11087 for (unsigned int i = 0; i < nvectors; ++i)
11089 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11090 /* Provide a dummy definition until the real one is available. */
11091 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11092 rgm->controls[i] = mask;
11095 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11096 TYPE_VECTOR_SUBPARTS (vectype)))
11097 return rgm->controls[index];
11099 /* Split the vector if needed. Since we are dealing with integer-mode
11100 masks with AVX512 we can operate on the integer representation,
11101 shifting the whole vector at once. */
11102 unsigned HOST_WIDE_INT factor;
11103 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11104 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11105 gcc_assert (ok);
11106 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11107 tree mask_type = truth_type_for (vectype);
11108 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11109 unsigned vi = index / factor;
11110 unsigned vpart = index % factor;
11111 tree vec = rgm->controls[vi];
11112 gimple_seq seq = NULL;
11113 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11114 lang_hooks.types.type_for_mode
11115 (TYPE_MODE (rgm->type), 1), vec);
11116 /* For integer mode masks simply shift the right bits into position. */
11117 if (vpart != 0)
11118 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11119 build_int_cst (integer_type_node,
11120 (TYPE_VECTOR_SUBPARTS (vectype)
11121 * vpart)));
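/* Illustration (hypothetical values): if rgm->type has 16 elements and
   VECTYPE has 8, then factor == 2; for index == 3 we use
   rgm->controls[1] (vi == 1) and, since vpart == 1, shift the integer
   view right by 8 * 1 bits before converting to the 8-element mask
   type below.  */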
11122 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11123 (TYPE_MODE (mask_type), 1), vec);
11124 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11125 if (seq)
11126 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11127 return vec;
11129 else
11130 gcc_unreachable ();
11133 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11134 lengths for controlling an operation on VECTYPE. The operation splits
11135 each element of VECTYPE into FACTOR separate subelements, measuring the
11136 length as a number of these subelements. */
11138 void
11139 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11140 unsigned int nvectors, tree vectype, unsigned int factor)
11142 gcc_assert (nvectors != 0);
11143 if (lens->length () < nvectors)
11144 lens->safe_grow_cleared (nvectors, true);
11145 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11147 /* The number of scalars per iteration, the bytes occupied per scalar and
11148 the number of vectors are all compile-time constants. */
11149 unsigned int nscalars_per_iter
11150 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11151 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11153 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11155 /* For now, we only support cases in which all loads and stores fall back
11156 to VnQI or none do. */
11157 gcc_assert (!rgl->max_nscalars_per_iter
11158 || (rgl->factor == 1 && factor == 1)
11159 || (rgl->max_nscalars_per_iter * rgl->factor
11160 == nscalars_per_iter * factor));
11161 rgl->max_nscalars_per_iter = nscalars_per_iter;
11162 rgl->type = vectype;
11163 rgl->factor = factor;
11167 /* Given a complete set of lengths LENS, extract length number INDEX
11168 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11169 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11170 multiplied by the number of elements that should be processed.
11171 Insert any set-up statements before GSI. */
11173 tree
11174 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11175 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11176 unsigned int index, unsigned int factor)
11178 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11179 bool use_bias_adjusted_len =
11180 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11182 /* Populate the rgroup's len array, if this is the first time we've
11183 used it. */
11184 if (rgl->controls.is_empty ())
11186 rgl->controls.safe_grow_cleared (nvectors, true);
11187 for (unsigned int i = 0; i < nvectors; ++i)
11189 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11190 gcc_assert (len_type != NULL_TREE);
11192 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11194 /* Provide a dummy definition until the real one is available. */
11195 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11196 rgl->controls[i] = len;
11198 if (use_bias_adjusted_len)
11200 gcc_assert (i == 0);
11201 tree adjusted_len =
11202 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11203 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11204 rgl->bias_adjusted_ctrl = adjusted_len;
11209 if (use_bias_adjusted_len)
11210 return rgl->bias_adjusted_ctrl;
11212 tree loop_len = rgl->controls[index];
11213 if (rgl->factor == 1 && factor == 1)
11215 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11216 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11217 if (maybe_ne (nunits1, nunits2))
11219 /* A loop len for data type X can be reused for data type Y
11220 if X has N times more elements than Y and if Y's elements
11221 are N times bigger than X's. */
11222 gcc_assert (multiple_p (nunits1, nunits2));
11223 factor = exact_div (nunits1, nunits2).to_constant ();
11224 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11225 gimple_seq seq = NULL;
11226 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11227 build_int_cst (iv_type, factor));
11228 if (seq)
11229 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
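/* Illustration (hypothetical values): if rgl->type has 8 elements and
   VECTYPE has 4, then factor == 2 and e.g. a length of 6 of the finer
   elements corresponds to a length of 3 of the coarser ones.  */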
11232 return loop_len;
11235 /* Scale profiling counters by estimation for LOOP which is vectorized
11236 by factor VF.
11237 If FLAT is true, the loop we started with had unrealistically flat
11238 profile. */
11240 static void
11241 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11243 /* For flat profiles do not scale down proportionally by VF; only
11244 cap by the known iteration count bounds. */
11245 if (flat)
11247 if (dump_file && (dump_flags & TDF_DETAILS))
11248 fprintf (dump_file,
11249 "Vectorized loop profile seems flat; not scaling iteration "
11250 "count down by the vectorization factor %i\n", vf);
11251 scale_loop_profile (loop, profile_probability::always (),
11252 get_likely_max_loop_iterations_int (loop));
11253 return;
11255 /* The loop body executes VF times fewer iterations and the exit probability increases by a factor of VF. */
11256 profile_count entry_count = loop_preheader_edge (loop)->count ();
11258 /* If we have an unreliable loop profile, avoid dropping the entry
11259 count below the header count. This can happen since loops
11260 have unrealistically low trip counts. */
11261 while (vf > 1
11262 && loop->header->count > entry_count
11263 && loop->header->count < entry_count * vf)
11265 if (dump_file && (dump_flags & TDF_DETAILS))
11266 fprintf (dump_file,
11267 "Vectorization factor %i seems too large for profile "
11268 "prevoiusly believed to be consistent; reducing.\n", vf);
11269 vf /= 2;
11272 if (entry_count.nonzero_p ())
11273 set_edge_probability_and_rescale_others
11274 (exit_e,
11275 entry_count.probability_in (loop->header->count / vf));
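/* Illustration (hypothetical counts): with an entry count of 100, a
   header count of 250 and VF == 4 the loop above halves VF to 2
   (250 < 100 * 4 but 250 >= 100 * 2), and the exit probability becomes
   100 / (250 / 2) = 80%.  */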
11276 /* Avoid producing a very large exit probability when we do not have
11277 a sensible profile. */
11278 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11279 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11280 loop->latch->count = single_pred_edge (loop->latch)->count ();
11282 scale_loop_profile (loop, profile_probability::always () / vf,
11283 get_likely_max_loop_iterations_int (loop));
11286 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11287 latch edge values originally defined by it. */
11289 static void
11290 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11291 stmt_vec_info def_stmt_info)
11293 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11294 if (!def || TREE_CODE (def) != SSA_NAME)
11295 return;
11296 stmt_vec_info phi_info;
11297 imm_use_iterator iter;
11298 use_operand_p use_p;
11299 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11301 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11302 if (!phi)
11303 continue;
11304 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11305 && (phi_info = loop_vinfo->lookup_stmt (phi))
11306 && STMT_VINFO_RELEVANT_P (phi_info)))
11307 continue;
11308 loop_p loop = gimple_bb (phi)->loop_father;
11309 edge e = loop_latch_edge (loop);
11310 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11311 continue;
11313 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11314 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11315 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11317 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11318 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11319 gcc_assert (phi_defs.length () == latch_defs.length ());
11320 for (unsigned i = 0; i < phi_defs.length (); ++i)
11321 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11322 gimple_get_lhs (latch_defs[i]), e,
11323 gimple_phi_arg_location (phi, e->dest_idx));
11325 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11327 /* For first order recurrences we have to update both uses of
11328 the latch definition, the one in the PHI node and the one
11329 in the generated VEC_PERM_EXPR. */
11330 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11331 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11332 gcc_assert (phi_defs.length () == latch_defs.length ());
11333 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11334 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11335 for (unsigned i = 0; i < phi_defs.length (); ++i)
11337 gassign *perm = as_a <gassign *> (phi_defs[i]);
11338 if (i > 0)
11339 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11340 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11341 update_stmt (perm);
11343 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11344 gimple_phi_arg_location (phi, e->dest_idx));
11349 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11350 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11351 stmt_vec_info. */
11353 static bool
11354 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11355 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11357 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11358 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11360 if (dump_enabled_p ())
11361 dump_printf_loc (MSG_NOTE, vect_location,
11362 "------>vectorizing statement: %G", stmt_info->stmt);
11364 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11365 vect_loop_kill_debug_uses (loop, stmt_info);
11367 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11368 && !STMT_VINFO_LIVE_P (stmt_info))
11370 if (is_gimple_call (stmt_info->stmt)
11371 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11373 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11374 *seen_store = stmt_info;
11375 return false;
11377 return false;
11380 if (STMT_VINFO_VECTYPE (stmt_info))
11382 poly_uint64 nunits
11383 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11384 if (!STMT_SLP_TYPE (stmt_info)
11385 && maybe_ne (nunits, vf)
11386 && dump_enabled_p ())
11387 /* For SLP the VF is set according to the unrolling factor, and not
11388 to the vector size, hence for SLP this diagnostic is not valid. */
11389 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11392 /* Pure SLP statements have already been vectorized. We still need
11393 to apply loop vectorization to hybrid SLP statements. */
11394 if (PURE_SLP_STMT (stmt_info))
11395 return false;
11397 if (dump_enabled_p ())
11398 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11400 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11401 *seen_store = stmt_info;
11403 return true;
11406 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11407 in the hash_map with their corresponding values. */
11409 static tree
11410 find_in_mapping (tree t, void *context)
11412 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11414 tree *value = mapping->get (t);
11415 return value ? *value : t;
11418 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11419 original loop that has now been vectorized.
11421 The inits of the data_references need to be advanced with the number of
11422 iterations of the main loop. This has been computed in vect_do_peeling and
11423 is stored in parameter ADVANCE. We first restore the data_references
11424 initial offset with the values recorded in ORIG_DRS_INIT.
11426 Since the loop_vec_info of this EPILOGUE was constructed for the original
11427 loop, its stmt_vec_infos all point to the original statements. These need
11428 to be updated to point to their corresponding copies as well as the SSA_NAMES
11429 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11431 The data_references' connections also need to be updated. Their
11432 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
11433 stmt_vec_infos, their statements need to point to their corresponding copy,
11434 if they are gather loads or scatter stores then their reference needs to be
11435 updated to point to its corresponding copy and finally we set
11436 'base_misaligned' to false as we have already peeled for alignment in the
11437 prologue of the main loop. */
11439 static void
11440 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11442 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11443 auto_vec<gimple *> stmt_worklist;
11444 hash_map<tree,tree> mapping;
11445 gimple *orig_stmt, *new_stmt;
11446 gimple_stmt_iterator epilogue_gsi;
11447 gphi_iterator epilogue_phi_gsi;
11448 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11449 basic_block *epilogue_bbs = get_loop_body (epilogue);
11450 unsigned i;
11452 free (LOOP_VINFO_BBS (epilogue_vinfo));
11453 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11455 /* Advance data_reference's with the number of iterations of the previous
11456 loop and its prologue. */
11457 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11460 /* The EPILOGUE loop is a copy of the original loop so they share the same
11461 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11462 point to the copied statements. We also create a mapping of all LHS' in
11463 the original loop and all the LHS' in the EPILOGUE and create worklists to
11464 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11465 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11467 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11468 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11470 new_stmt = epilogue_phi_gsi.phi ();
11472 gcc_assert (gimple_uid (new_stmt) > 0);
11473 stmt_vinfo
11474 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11476 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11477 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11479 mapping.put (gimple_phi_result (orig_stmt),
11480 gimple_phi_result (new_stmt));
11481 /* PHI nodes can not have patterns or related statements. */
11482 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11483 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11486 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11487 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11489 new_stmt = gsi_stmt (epilogue_gsi);
11490 if (is_gimple_debug (new_stmt))
11491 continue;
11493 gcc_assert (gimple_uid (new_stmt) > 0);
11494 stmt_vinfo
11495 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11497 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11498 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11500 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11501 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11503 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11505 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11506 for (gimple_stmt_iterator gsi = gsi_start (seq);
11507 !gsi_end_p (gsi); gsi_next (&gsi))
11508 stmt_worklist.safe_push (gsi_stmt (gsi));
11511 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11512 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11514 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11515 stmt_worklist.safe_push (stmt);
11516 /* Set BB such that the assert in
11517 'get_initial_def_for_reduction' is able to determine that
11518 the BB of the related stmt is inside this loop. */
11519 gimple_set_bb (stmt,
11520 gimple_bb (new_stmt));
11521 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11522 gcc_assert (related_vinfo == NULL
11523 || related_vinfo == stmt_vinfo);
11528 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11529 using the original main loop and thus need to be updated to refer to the
11530 cloned variables used in the epilogue. */
11531 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11533 gimple *stmt = stmt_worklist[i];
11534 tree *new_op;
11536 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11538 tree op = gimple_op (stmt, j);
11539 if ((new_op = mapping.get(op)))
11540 gimple_set_op (stmt, j, *new_op);
11541 else
11543 /* PR92429: The last argument of simplify_replace_tree disables
11544 folding when replacing arguments. This is required as
11545 otherwise you might end up with different statements than the
11546 ones analyzed in vect_loop_analyze, leading to different
11547 vectorization. */
11548 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11549 &find_in_mapping, &mapping, false);
11550 gimple_set_op (stmt, j, op);
11555 struct data_reference *dr;
11556 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11557 FOR_EACH_VEC_ELT (datarefs, i, dr)
11559 orig_stmt = DR_STMT (dr);
11560 gcc_assert (gimple_uid (orig_stmt) > 0);
11561 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11562 /* Data references for gather loads and scatter stores do not use the
11563 updated offset we set using ADVANCE. Instead we have to make sure the
11564 reference in the data references point to the corresponding copy of
11565 the original in the epilogue. Make sure to update both
11566 gather/scatters recognized by dataref analysis and also other
11567 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11568 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11569 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11570 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11572 DR_REF (dr)
11573 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11574 &find_in_mapping, &mapping);
11575 DR_BASE_ADDRESS (dr)
11576 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11577 &find_in_mapping, &mapping);
11579 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11580 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11581 /* The vector size of the epilogue is smaller than that of the main loop,
11582 so the alignment requirement is either the same or lower. This means
11583 the dr will by definition be aligned. */
11584 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11587 epilogue_vinfo->shared->datarefs_copy.release ();
11588 epilogue_vinfo->shared->save_datarefs ();
11591 /* Function vect_transform_loop.
11593 The analysis phase has determined that the loop is vectorizable.
11594 Vectorize the loop - created vectorized stmts to replace the scalar
11595 stmts in the loop, and update the loop exit condition.
11596 Returns scalar epilogue loop if any. */
11598 class loop *
11599 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11601 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11602 class loop *epilogue = NULL;
11603 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11604 int nbbs = loop->num_nodes;
11605 int i;
11606 tree niters_vector = NULL_TREE;
11607 tree step_vector = NULL_TREE;
11608 tree niters_vector_mult_vf = NULL_TREE;
11609 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11610 unsigned int lowest_vf = constant_lower_bound (vf);
11611 gimple *stmt;
11612 bool check_profitability = false;
11613 unsigned int th;
11614 bool flat = maybe_flat_loop_profile (loop);
11616 DUMP_VECT_SCOPE ("vec_transform_loop");
11618 loop_vinfo->shared->check_datarefs ();
11620 /* Use the more conservative vectorization threshold. If the number
11621 of iterations is constant, assume the cost check has been performed
11622 by our caller. If the threshold makes all loops profitable that
11623 run at least the (estimated) vectorization factor number of times,
11624 checking is pointless, too. */
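/* For illustration (hypothetical numbers): with a cost-model threshold of 16
   and an unknown iteration count, the runtime guard emitted during peeling or
   versioning sends iteration counts below the threshold to the scalar loop.  */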
11625 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11626 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11628 if (dump_enabled_p ())
11629 dump_printf_loc (MSG_NOTE, vect_location,
11630 "Profitability threshold is %d loop iterations.\n",
11631 th);
11632 check_profitability = true;
11635 /* Make sure there exists a single-predecessor exit bb. Do this before
11636 versioning. */
11637 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11638 if (! single_pred_p (e->dest))
11640 split_loop_exit_edge (e, true);
11641 if (dump_enabled_p ())
11642 dump_printf (MSG_NOTE, "split exit edge\n");
11645 /* Version the loop first, if required, so the profitability check
11646 comes first. */
11648 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11650 class loop *sloop
11651 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11652 sloop->force_vectorize = false;
11653 check_profitability = false;
11656 /* Make sure there exists a single-predecessor exit bb also on the
11657 scalar loop copy. Do this after versioning but before peeling
11658 so the CFG structure is fine for both the scalar and the if-converted
11659 loop, and slpeel_duplicate_current_defs_from_edges sees matching
11660 loop-closed PHI nodes on the exit. */
11661 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11663 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11664 if (! single_pred_p (e->dest))
11666 split_loop_exit_edge (e, true);
11667 if (dump_enabled_p ())
11668 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11672 tree niters = vect_build_loop_niters (loop_vinfo);
11673 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11674 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11675 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11676 tree advance;
11677 drs_init_vec orig_drs_init;
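/* Peel prologue and epilogue iterations as decided during analysis; this may
   create the epilogue loop that is returned at the end of this function, as
   well as the runtime cost-model check when CHECK_PROFITABILITY is set.  */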
11679 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11680 &step_vector, &niters_vector_mult_vf, th,
11681 check_profitability, niters_no_overflow,
11682 &advance);
11683 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11684 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11686 /* Ifcvt duplicates the loop preheader and loop body and produces a basic
11687 block after the loop exit. We need to scale all of that. */
11688 basic_block preheader
11689 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11690 preheader->count
11691 = preheader->count.apply_probability
11692 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11693 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11694 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11695 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11696 = preheader->count;
11699 if (niters_vector == NULL_TREE)
11701 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11702 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11703 && known_eq (lowest_vf, vf))
11705 niters_vector
11706 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11707 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11708 step_vector = build_one_cst (TREE_TYPE (niters));
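/* For example (hypothetical values): NITERS = 19 and VF = 4 give
   niters_vector = 19 / 4 = 4 vector iterations with step_vector = 1;
   the remaining 3 scalar iterations are left for the epilogue.  */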
11710 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11711 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11712 &step_vector, niters_no_overflow);
11713 else
11714 /* vect_do_peeling subtracted the number of peeled prologue
11715 iterations from LOOP_VINFO_NITERS. */
11716 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11717 &niters_vector, &step_vector,
11718 niters_no_overflow);
11721 /* 1) Make sure the loop header has exactly two entries
11722 2) Make sure we have a preheader basic block. */
11724 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11726 split_edge (loop_preheader_edge (loop));
11728 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11729 /* This will deal with any possible peeling. */
11730 vect_prepare_for_masked_peels (loop_vinfo);
11732 /* Schedule the SLP instances first, then handle loop vectorization
11733 below. */
11734 if (!loop_vinfo->slp_instances.is_empty ())
11736 DUMP_VECT_SCOPE ("scheduling SLP instances");
11737 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11740 /* FORNOW: the vectorizer supports only loops whose body consists
11741 of one basic block (header + empty latch). When the vectorizer
11742 supports more involved loop forms, the order in which the BBs are
11743 traversed needs to be reconsidered. */
11745 for (i = 0; i < nbbs; i++)
11747 basic_block bb = bbs[i];
11748 stmt_vec_info stmt_info;
11750 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11751 gsi_next (&si))
11753 gphi *phi = si.phi ();
11754 if (dump_enabled_p ())
11755 dump_printf_loc (MSG_NOTE, vect_location,
11756 "------>vectorizing phi: %G", (gimple *) phi);
11757 stmt_info = loop_vinfo->lookup_stmt (phi);
11758 if (!stmt_info)
11759 continue;
11761 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11762 vect_loop_kill_debug_uses (loop, stmt_info);
11764 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11765 && !STMT_VINFO_LIVE_P (stmt_info))
11766 continue;
11768 if (STMT_VINFO_VECTYPE (stmt_info)
11769 && (maybe_ne
11770 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11771 && dump_enabled_p ())
11772 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11774 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11775 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11776 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11777 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11778 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11779 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11780 && ! PURE_SLP_STMT (stmt_info))
11782 if (dump_enabled_p ())
11783 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11784 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11788 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11789 gsi_next (&si))
11791 gphi *phi = si.phi ();
11792 stmt_info = loop_vinfo->lookup_stmt (phi);
11793 if (!stmt_info)
11794 continue;
11796 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11797 && !STMT_VINFO_LIVE_P (stmt_info))
11798 continue;
11800 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11801 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11802 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11803 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11804 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11805 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11806 && ! PURE_SLP_STMT (stmt_info))
11807 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11810 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11811 !gsi_end_p (si);)
11813 stmt = gsi_stmt (si);
11814 /* During vectorization remove existing clobber stmts. */
11815 if (gimple_clobber_p (stmt))
11817 unlink_stmt_vdef (stmt);
11818 gsi_remove (&si, true);
11819 release_defs (stmt);
11821 else
11823 /* Ignore vector stmts created in the outer loop. */
11824 stmt_info = loop_vinfo->lookup_stmt (stmt);
11826 /* vector stmts created in the outer-loop during vectorization of
11827 stmts in an inner-loop may not have a stmt_info, and do not
11828 need to be vectorized. */
11829 stmt_vec_info seen_store = NULL;
11830 if (stmt_info)
11832 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
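/* For statements replaced by a pattern, first emit any statements in the
   pattern definition sequence and then the main pattern statement that
   replaced the original scalar statement.  */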
11834 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11835 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11836 !gsi_end_p (subsi); gsi_next (&subsi))
11838 stmt_vec_info pat_stmt_info
11839 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11840 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11841 &si, &seen_store);
11843 stmt_vec_info pat_stmt_info
11844 = STMT_VINFO_RELATED_STMT (stmt_info);
11845 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11846 &si, &seen_store))
11847 maybe_set_vectorized_backedge_value (loop_vinfo,
11848 pat_stmt_info);
11850 else
11852 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11853 &seen_store))
11854 maybe_set_vectorized_backedge_value (loop_vinfo,
11855 stmt_info);
11858 gsi_next (&si);
11859 if (seen_store)
11861 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11862 /* Interleaving. The vectorization of the
11863 interleaving chain has been completed -
11864 free all the stores in the chain. */
11865 vect_remove_stores (loop_vinfo,
11866 DR_GROUP_FIRST_ELEMENT (seen_store));
11867 else
11868 /* Free the attached stmt_vec_info and remove the stmt. */
11869 loop_vinfo->remove_stmt (stmt_info);
11874 /* Stub out scalar statements that must not survive vectorization.
11875 Doing this here helps with grouped statements, or statements that
11876 are involved in patterns. */
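/* For illustration (hypothetical SSA names): a leftover scalar
   _5 = .MASK_LOAD (ptr_6, 8B, mask_7) is replaced below by _5 = 0, and a
   conditional call such as _9 = .COND_ADD (mask_7, _1, _2, _3) is replaced
   by _9 = _3, i.e. by its else value.  */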
11877 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11878 !gsi_end_p (gsi); gsi_next (&gsi))
11880 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11881 if (!call || !gimple_call_internal_p (call))
11882 continue;
11883 internal_fn ifn = gimple_call_internal_fn (call);
11884 if (ifn == IFN_MASK_LOAD)
11886 tree lhs = gimple_get_lhs (call);
11887 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11889 tree zero = build_zero_cst (TREE_TYPE (lhs));
11890 gimple *new_stmt = gimple_build_assign (lhs, zero);
11891 gsi_replace (&gsi, new_stmt, true);
11894 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11896 tree lhs = gimple_get_lhs (call);
11897 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11899 tree else_arg
11900 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11901 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11902 gsi_replace (&gsi, new_stmt, true);
11906 } /* BBs in loop */
11908 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11909 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11910 if (integer_onep (step_vector))
11911 niters_no_overflow = true;
11912 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11913 niters_vector, step_vector, niters_vector_mult_vf,
11914 !niters_no_overflow);
11916 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11918 /* True if the final iteration might not handle a full vector's
11919 worth of scalar iterations. */
11920 bool final_iter_may_be_partial
11921 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11922 /* The minimum number of iterations performed by the epilogue. This
11923 is 1 when peeling for gaps because we always need a final scalar
11924 iteration. */
11925 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11926 /* +1 to convert latch counts to loop iteration counts,
11927 -min_epilogue_iters to remove iterations that cannot be performed
11928 by the vector code. */
11929 int bias_for_lowest = 1 - min_epilogue_iters;
11930 int bias_for_assumed = bias_for_lowest;
11931 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11932 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11934 /* When the amount of peeling is known at compile time, the first
11935 iteration will have exactly alignment_npeels active elements.
11936 In the worst case it will have at least one. */
11937 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11938 bias_for_lowest += lowest_vf - min_first_active;
11939 bias_for_assumed += assumed_vf - min_first_active;
11941 /* In these calculations the "- 1" converts loop iteration counts
11942 back to latch counts. */
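/* For example (hypothetical values): a scalar latch bound of 99 (100
   iterations), VF = lowest_vf = 4, no peeling for gaps and no partial
   vectors give bias_for_lowest = 1 and a new latch bound of
   (99 + 1) / 4 - 1 = 24, i.e. at most 25 vector iterations.  */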
11943 if (loop->any_upper_bound)
11945 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11946 loop->nb_iterations_upper_bound
11947 = (final_iter_may_be_partial
11948 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11949 lowest_vf) - 1
11950 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11951 lowest_vf) - 1);
11952 if (main_vinfo
11953 /* Both peeling for alignment and peeling for gaps can end up
11954 with the scalar epilogue running for more than VF-1 iterations. */
11955 && !main_vinfo->peeling_for_alignment
11956 && !main_vinfo->peeling_for_gaps)
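/* MAIN_ITERS is (roughly) the largest number of scalar iterations that can
   reach this epilogue loop: the maximum of the main loop's VF and its
   cost-model and versioning thresholds.  Dividing it by this loop's VF
   bounds the epilogue's iteration count.  */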
11958 unsigned int bound;
11959 poly_uint64 main_iters
11960 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11961 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11962 main_iters
11963 = upper_bound (main_iters,
11964 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11965 if (can_div_away_from_zero_p (main_iters,
11966 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11967 &bound))
11968 loop->nb_iterations_upper_bound
11969 = wi::umin ((bound_wide_int) (bound - 1),
11970 loop->nb_iterations_upper_bound);
11973 if (loop->any_likely_upper_bound)
11974 loop->nb_iterations_likely_upper_bound
11975 = (final_iter_may_be_partial
11976 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11977 + bias_for_lowest, lowest_vf) - 1
11978 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11979 + bias_for_lowest, lowest_vf) - 1);
11980 if (loop->any_estimate)
11981 loop->nb_iterations_estimate
11982 = (final_iter_may_be_partial
11983 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11984 assumed_vf) - 1
11985 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11986 assumed_vf) - 1);
11987 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11988 assumed_vf, flat);
11990 if (dump_enabled_p ())
11992 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11994 dump_printf_loc (MSG_NOTE, vect_location,
11995 "LOOP VECTORIZED\n");
11996 if (loop->inner)
11997 dump_printf_loc (MSG_NOTE, vect_location,
11998 "OUTER LOOP VECTORIZED\n");
11999 dump_printf (MSG_NOTE, "\n");
12001 else
12002 dump_printf_loc (MSG_NOTE, vect_location,
12003 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12004 GET_MODE_NAME (loop_vinfo->vector_mode));
12007 /* Loops vectorized with a variable factor won't benefit from
12008 unrolling/peeling. */
12009 if (!vf.is_constant ())
12011 loop->unroll = 1;
12012 if (dump_enabled_p ())
12013 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12014 " variable-length vectorization factor\n");
12016 /* Free SLP instances here because otherwise stmt reference counting
12017 won't work. */
12018 slp_instance instance;
12019 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12020 vect_free_slp_instance (instance);
12021 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12022 /* Clear the safelen field since its value is invalid after vectorization:
12023 the vectorized loop can have loop-carried dependencies. */
12024 loop->safelen = 0;
12026 if (epilogue)
12028 update_epilogue_loop_vinfo (epilogue, advance);
12030 epilogue->simduid = loop->simduid;
12031 epilogue->force_vectorize = loop->force_vectorize;
12032 epilogue->dont_vectorize = false;
12035 return epilogue;
12038 /* The code below performs a simple optimization - it reverts
12039 if-conversion for masked stores, i.e. if the mask of a store is zero,
12040 do not perform the store and, if possible, skip the stored value producers as well.
12041 For example,
12042 for (i=0; i<n; i++)
12043 if (c[i])
12045 p1[i] += 1;
12046 p2[i] = p3[i] + 2;
12048 this transformation will produce the following semi-hammock:
12050 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12052 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12053 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12054 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12055 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12056 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12057 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
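     (The loads, adds and stores are thus executed only when the mask is not
     all zeros, and the vdef chain is rewired through a new PHI node in the
     join block, as done below.)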
12061 void
12062 optimize_mask_stores (class loop *loop)
12064 basic_block *bbs = get_loop_body (loop);
12065 unsigned nbbs = loop->num_nodes;
12066 unsigned i;
12067 basic_block bb;
12068 class loop *bb_loop;
12069 gimple_stmt_iterator gsi;
12070 gimple *stmt;
12071 auto_vec<gimple *> worklist;
12072 auto_purge_vect_location sentinel;
12074 vect_location = find_loop_location (loop);
12075 /* Pick up all masked stores in loop if any. */
12076 for (i = 0; i < nbbs; i++)
12078 bb = bbs[i];
12079 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12080 gsi_next (&gsi))
12082 stmt = gsi_stmt (gsi);
12083 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12084 worklist.safe_push (stmt);
12088 free (bbs);
12089 if (worklist.is_empty ())
12090 return;
12092 /* Loop has masked stores. */
12093 while (!worklist.is_empty ())
12095 gimple *last, *last_store;
12096 edge e, efalse;
12097 tree mask;
12098 basic_block store_bb, join_bb;
12099 gimple_stmt_iterator gsi_to;
12100 tree vdef, new_vdef;
12101 gphi *phi;
12102 tree vectype;
12103 tree zero;
12105 last = worklist.pop ();
12106 mask = gimple_call_arg (last, 2);
12107 bb = gimple_bb (last);
12108 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
12109 the same loop as if_bb. It can be different from LOOP when a two-
12110 level loop nest is vectorized and the mask_store belongs to the inner
12111 one. */
12112 e = split_block (bb, last);
12113 bb_loop = bb->loop_father;
12114 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12115 join_bb = e->dest;
12116 store_bb = create_empty_bb (bb);
12117 add_bb_to_loop (store_bb, bb_loop);
12118 e->flags = EDGE_TRUE_VALUE;
12119 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12120 /* Put STORE_BB on the likely path. */
12121 efalse->probability = profile_probability::likely ();
12122 e->probability = efalse->probability.invert ();
12123 store_bb->count = efalse->count ();
12124 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12125 if (dom_info_available_p (CDI_DOMINATORS))
12126 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12127 if (dump_enabled_p ())
12128 dump_printf_loc (MSG_NOTE, vect_location,
12129 "Create new block %d to sink mask stores.",
12130 store_bb->index);
12131 /* Create vector comparison with boolean result. */
12132 vectype = TREE_TYPE (mask);
12133 zero = build_zero_cst (vectype);
12134 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12135 gsi = gsi_last_bb (bb);
12136 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12137 /* Create new PHI node for vdef of the last masked store:
12138 .MEM_2 = VDEF <.MEM_1>
12139 will be converted to
12140 .MEM_3 = VDEF <.MEM_1>
12141 and new PHI node will be created in join bb
12142 .MEM_2 = PHI <.MEM_1, .MEM_3>
12144 vdef = gimple_vdef (last);
12145 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12146 gimple_set_vdef (last, new_vdef);
12147 phi = create_phi_node (vdef, join_bb);
12148 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12150 /* Put all masked stores with the same mask to STORE_BB if possible. */
12151 while (true)
12153 gimple_stmt_iterator gsi_from;
12154 gimple *stmt1 = NULL;
12156 /* Move masked store to STORE_BB. */
12157 last_store = last;
12158 gsi = gsi_for_stmt (last);
12159 gsi_from = gsi;
12160 /* Shift GSI to the previous stmt for further traversal. */
12161 gsi_prev (&gsi);
12162 gsi_to = gsi_start_bb (store_bb);
12163 gsi_move_before (&gsi_from, &gsi_to);
12164 /* Set GSI_TO to the start of the now non-empty block. */
12165 gsi_to = gsi_start_bb (store_bb);
12166 if (dump_enabled_p ())
12167 dump_printf_loc (MSG_NOTE, vect_location,
12168 "Move stmt to created bb\n%G", last);
12169 /* Move all stored value producers if possible. */
12170 while (!gsi_end_p (gsi))
12172 tree lhs;
12173 imm_use_iterator imm_iter;
12174 use_operand_p use_p;
12175 bool res;
12177 /* Skip debug statements. */
12178 if (is_gimple_debug (gsi_stmt (gsi)))
12180 gsi_prev (&gsi);
12181 continue;
12183 stmt1 = gsi_stmt (gsi);
12184 /* Do not consider statements writing to memory or having a
12185 volatile operand. */
12186 if (gimple_vdef (stmt1)
12187 || gimple_has_volatile_ops (stmt1))
12188 break;
12189 gsi_from = gsi;
12190 gsi_prev (&gsi);
12191 lhs = gimple_get_lhs (stmt1);
12192 if (!lhs)
12193 break;
12195 /* LHS of vectorized stmt must be SSA_NAME. */
12196 if (TREE_CODE (lhs) != SSA_NAME)
12197 break;
12199 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12201 /* Remove dead scalar statement. */
12202 if (has_zero_uses (lhs))
12204 gsi_remove (&gsi_from, true);
12205 continue;
12209 /* Check that LHS does not have uses outside of STORE_BB. */
12210 res = true;
12211 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12213 gimple *use_stmt;
12214 use_stmt = USE_STMT (use_p);
12215 if (is_gimple_debug (use_stmt))
12216 continue;
12217 if (gimple_bb (use_stmt) != store_bb)
12219 res = false;
12220 break;
12223 if (!res)
12224 break;
12226 if (gimple_vuse (stmt1)
12227 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12228 break;
12230 /* Can move STMT1 to STORE_BB. */
12231 if (dump_enabled_p ())
12232 dump_printf_loc (MSG_NOTE, vect_location,
12233 "Move stmt to created bb\n%G", stmt1);
12234 gsi_move_before (&gsi_from, &gsi_to);
12235 /* Shift GSI_TO for further insertion. */
12236 gsi_prev (&gsi_to);
12238 /* Put other masked stores with the same mask to STORE_BB. */
12239 if (worklist.is_empty ()
12240 || gimple_call_arg (worklist.last (), 2) != mask
12241 || worklist.last () != stmt1)
12242 break;
12243 last = worklist.pop ();
12245 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12249 /* Decide whether it is possible to use a zero-based induction variable
12250 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12251 the value that the induction variable must be able to hold in order
12252 to ensure that the rgroups eventually have no active vector elements.
12253 Return -1 otherwise. */
12255 widest_int
12256 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12258 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12259 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12260 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12262 /* Calculate the value that the induction variable must be able
12263 to hit in order to ensure that we end the loop with an all-false mask.
12264 This involves adding the maximum number of inactive trailing scalar
12265 iterations. */
12266 widest_int iv_limit = -1;
12267 if (max_loop_iterations (loop, &iv_limit))
12269 if (niters_skip)
12271 /* Add the maximum number of skipped iterations to the
12272 maximum iteration count. */
12273 if (TREE_CODE (niters_skip) == INTEGER_CST)
12274 iv_limit += wi::to_widest (niters_skip);
12275 else
12276 iv_limit += max_vf - 1;
12278 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12279 /* Make a conservatively-correct assumption. */
12280 iv_limit += max_vf - 1;
12282 /* IV_LIMIT is the maximum number of latch iterations, which is also
12283 the maximum in-range IV value. Round this value down to the previous
12284 vector alignment boundary and then add an extra full iteration. */
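/* For example (hypothetical values): with VF = 4, max_vf = 4 and a latch
   bound of 10, the rounding gives (10 & -4) + 4 = 8 + 4 = 12.  */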
12285 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12286 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12288 return iv_limit;
12291 /* For the given rgroup_controls RGC, check whether an induction variable
12292 would ever hit a value that produces a set of all-false masks or zero
12293 lengths before wrapping around. Return true if it's possible to wrap
12294 around before hitting the desired value, otherwise return false. */
12296 bool
12297 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12299 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12301 if (iv_limit == -1)
12302 return true;
12304 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12305 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12306 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
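   /* For example (hypothetical values): IV_LIMIT = 1000 and NITEMS = 4 need
      wi::min_precision (4000, UNSIGNED) = 12 bits, so a 32-bit compare type
      cannot wrap, whereas an 8-bit compare type could.  */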
12308 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12309 return true;
12311 return false;