gcc/tree-vect-loop.cc

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2023 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "memmodel.h"
  36 #include "optabs.h"
  37 #include "diagnostic-core.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "cfganal.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop-niter.h"
  47 #include "tree-ssa-loop.h"
  48 #include "cfgloop.h"
  49 #include "tree-scalar-evolution.h"
  50 #include "tree-vectorizer.h"
  51 #include "gimple-fold.h"
  52 #include "cgraph.h"
  53 #include "tree-cfg.h"
  54 #include "tree-if-conv.h"
  55 #include "internal-fn.h"
  56 #include "tree-vector-builder.h"
  57 #include "vec-perm-indices.h"
  58 #include "tree-eh.h"
  59 #include "case-cfn-macros.h"
  60 #include "langhooks.h"
  61
  62 /* Loop Vectorization Pass.
  63
  64    This pass tries to vectorize loops.
  65
  66    For example, the vectorizer transforms the following simple loop:
  67
  68         short a[N]; short b[N]; short c[N]; int i;
  69
  70         for (i=0; i<N; i++){
  71           a[i] = b[i] + c[i];
  72         }
  73
  74    as if it was manually vectorized by rewriting the source code into:
  75
  76         typedef int __attribute__((mode(V8HI))) v8hi;
  77         short a[N];  short b[N]; short c[N];   int i;
  78         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  79         v8hi va, vb, vc;
  80
  81         for (i=0; i<N/8; i++){
  82           vb = pb[i];
  83           vc = pc[i];
  84           va = vb + vc;
  85           pa[i] = va;
  86         }
  87
  88         The main entry to this pass is vectorize_loops(), in which
  89    the vectorizer applies a set of analyses on a given set of loops,
  90    followed by the actual vectorization transformation for the loops that
  91    had successfully passed the analysis phase.
  92         Throughout this pass we make a distinction between two types of
  93    data: scalars (which are represented by SSA_NAMES), and memory references
  94    ("data-refs").  These two types of data require different handling both
  95    during analysis and transformation. The types of data-refs that the
   96    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  97    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  98    accesses are required to have a simple (consecutive) access pattern.
  99
 100    Analysis phase:
 101    ===============
 102         The driver for the analysis phase is vect_analyze_loop().
 103    It applies a set of analyses, some of which rely on the scalar evolution
 104    analyzer (scev) developed by Sebastian Pop.
 105
 106         During the analysis phase the vectorizer records some information
 107    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 108    loop, as well as general information about the loop as a whole, which is
 109    recorded in a "loop_vec_info" struct attached to each loop.
 110
 111    Transformation phase:
 112    =====================
 113         The loop transformation phase scans all the stmts in the loop, and
 114    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 115    the loop that needs to be vectorized.  It inserts the vector code sequence
 116    just before the scalar stmt S, and records a pointer to the vector code
 117    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 118    attached to S).  This pointer will be used for the vectorization of following
 119    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 120    otherwise, we rely on dead code elimination for removing it.
 121
 122         For example, say stmt S1 was vectorized into stmt VS1:
 123
 124    VS1: vb = px[i];
 125    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 126    S2:  a = b;
 127
 128    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 129    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 130    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 131    resulting sequence would be:
 132
 133    VS1: vb = px[i];
 134    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 135    VS2: va = vb;
 136    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 137
  138         Operands that are not SSA_NAMEs are data-refs that appear in
 139    load/store operations (like 'x[i]' in S1), and are handled differently.
 140
 141    Target modeling:
 142    =================
 143         Currently the only target specific information that is used is the
 144    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 145    Targets that can support different sizes of vectors, for now will need
 146    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 147    flexibility will be added in the future.
 148
  149         Since we only vectorize operations whose vector form can be
  150    expressed using existing tree codes, to verify that an operation is
  151    supported, the vectorizer checks the relevant optab at the relevant
  152    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 153    the value found is CODE_FOR_nothing, then there's no target support, and
 154    we can't vectorize the stmt.
 155
 156    For additional information on this project see:
 157    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 158 */
 159
 160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
 161                                                 unsigned *);
 162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 163                                                bool *, bool *, bool);
 164
 165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 166    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 167    may already be set for general statements (not just data refs).  */
 168
 169 static opt_result
 170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 171                               bool vectype_maybe_set_p,
 172                               poly_uint64 *vf)
 173 {
 174   gimple *stmt = stmt_info->stmt;
 175
 176   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 177        && !STMT_VINFO_LIVE_P (stmt_info))
 178       || gimple_clobber_p (stmt))
 179     {
 180       if (dump_enabled_p ())
 181         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 182       return opt_result::success ();
 183     }
 184
 185   tree stmt_vectype, nunits_vectype;
 186   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 187                                                    &stmt_vectype,
 188                                                    &nunits_vectype);
 189   if (!res)
 190     return res;
 191
 192   if (stmt_vectype)
 193     {
 194       if (STMT_VINFO_VECTYPE (stmt_info))
 195         /* The only case when a vectype had been already set is for stmts
 196            that contain a data ref, or for "pattern-stmts" (stmts generated
 197            by the vectorizer to represent/replace a certain idiom).  */
 198         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 199                      || vectype_maybe_set_p)
 200                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 201       else
 202         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 203     }
 204
 205   if (nunits_vectype)
 206     vect_update_max_nunits (vf, nunits_vectype);
 207
 208   return opt_result::success ();
 209 }
 210
 211 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 212    types of STMT_INFO and all attached pattern statements and update
 213    the vectorization factor VF accordingly.  Return true on success
 214    or false if something prevented vectorization.  */
 215
 216 static opt_result
 217 vect_determine_vf_for_stmt (vec_info *vinfo,
 218                             stmt_vec_info stmt_info, poly_uint64 *vf)
 219 {
 220   if (dump_enabled_p ())
 221     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 222                      stmt_info->stmt);
 223   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 224   if (!res)
 225     return res;
 226
 227   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 228       && STMT_VINFO_RELATED_STMT (stmt_info))
 229     {
 230       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 231       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 232
 233       /* If a pattern statement has def stmts, analyze them too.  */
 234       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 235            !gsi_end_p (si); gsi_next (&si))
 236         {
 237           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 238           if (dump_enabled_p ())
 239             dump_printf_loc (MSG_NOTE, vect_location,
 240                              "==> examining pattern def stmt: %G",
 241                              def_stmt_info->stmt);
 242           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 243           if (!res)
 244             return res;
 245         }
 246
 247       if (dump_enabled_p ())
 248         dump_printf_loc (MSG_NOTE, vect_location,
 249                          "==> examining pattern statement: %G",
 250                          stmt_info->stmt);
 251       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 252       if (!res)
 253         return res;
 254     }
 255
 256   return opt_result::success ();
 257 }
 258
 259 /* Function vect_determine_vectorization_factor
 260
 261    Determine the vectorization factor (VF).  VF is the number of data elements
 262    that are operated upon in parallel in a single iteration of the vectorized
 263    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 264    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 265    elements can fit in a single vector register.
 266
 267    We currently support vectorization of loops in which all types operated upon
 268    are of the same size.  Therefore this function currently sets VF according to
 269    the size of the types operated upon, and fails if there are multiple sizes
 270    in the loop.
 271
 272    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 273    original loop:
 274         for (i=0; i<N; i++){
 275           a[i] = b[i] + c[i];
 276         }
 277
 278    vectorized loop:
 279         for (i=0; i<N; i+=VF){
 280           a[i:VF] = b[i:VF] + c[i:VF];
 281         }
 282 */
 283
 284 static opt_result
 285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 286 {
 287   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 288   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 289   unsigned nbbs = loop->num_nodes;
 290   poly_uint64 vectorization_factor = 1;
 291   tree scalar_type = NULL_TREE;
 292   gphi *phi;
 293   tree vectype;
 294   stmt_vec_info stmt_info;
 295   unsigned i;
 296
 297   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 298
 299   for (i = 0; i < nbbs; i++)
 300     {
 301       basic_block bb = bbs[i];
 302
 303       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 304            gsi_next (&si))
 305         {
 306           phi = si.phi ();
 307           stmt_info = loop_vinfo->lookup_stmt (phi);
 308           if (dump_enabled_p ())
 309             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 310                              (gimple *) phi);
 311
 312           gcc_assert (stmt_info);
 313
 314           if (STMT_VINFO_RELEVANT_P (stmt_info)
 315               || STMT_VINFO_LIVE_P (stmt_info))
 316             {
 317               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 318               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 319
 320               if (dump_enabled_p ())
 321                 dump_printf_loc (MSG_NOTE, vect_location,
 322                                  "get vectype for scalar type:  %T\n",
 323                                  scalar_type);
 324
 325               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 326               if (!vectype)
 327                 return opt_result::failure_at (phi,
 328                                                "not vectorized: unsupported "
 329                                                "data-type %T\n",
 330                                                scalar_type);
 331               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 332
 333               if (dump_enabled_p ())
 334                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 335                                  vectype);
 336
 337               if (dump_enabled_p ())
 338                 {
 339                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 340                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 341                   dump_printf (MSG_NOTE, "\n");
 342                 }
 343
 344               vect_update_max_nunits (&vectorization_factor, vectype);
 345             }
 346         }
 347
 348       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 349            gsi_next (&si))
 350         {
 351           if (is_gimple_debug (gsi_stmt (si)))
 352             continue;
 353           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 354           opt_result res
 355             = vect_determine_vf_for_stmt (loop_vinfo,
 356                                           stmt_info, &vectorization_factor);
 357           if (!res)
 358             return res;
 359         }
 360     }
 361
 362   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 363   if (dump_enabled_p ())
 364     {
 365       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 366       dump_dec (MSG_NOTE, vectorization_factor);
 367       dump_printf (MSG_NOTE, "\n");
 368     }
 369
 370   if (known_le (vectorization_factor, 1U))
 371     return opt_result::failure_at (vect_location,
 372                                    "not vectorized: unsupported data-type\n");
 373   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 374   return opt_result::success ();
 375 }
 376
 377
 378 /* Function vect_is_simple_iv_evolution.
 379
 380    FORNOW: A simple evolution of an induction variables in the loop is
 381    considered a polynomial evolution.  */
 382
 383 static bool
 384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 385                              tree * step)
 386 {
 387   tree init_expr;
 388   tree step_expr;
 389   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 390   basic_block bb;
 391
 392   /* When there is no evolution in this loop, the evolution function
 393      is not "simple".  */
 394   if (evolution_part == NULL_TREE)
 395     return false;
 396
 397   /* When the evolution is a polynomial of degree >= 2
 398      the evolution function is not "simple".  */
 399   if (tree_is_chrec (evolution_part))
 400     return false;
 401
 402   step_expr = evolution_part;
 403   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 404
 405   if (dump_enabled_p ())
 406     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 407                      step_expr, init_expr);
 408
 409   *init = init_expr;
 410   *step = step_expr;
 411
 412   if (TREE_CODE (step_expr) != INTEGER_CST
 413       && (TREE_CODE (step_expr) != SSA_NAME
 414           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 415               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 416           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 417               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 418                   || !flag_associative_math)))
 419       && (TREE_CODE (step_expr) != REAL_CST
 420           || !flag_associative_math))
 421     {
 422       if (dump_enabled_p ())
 423         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 424                          "step unknown.\n");
 425       return false;
 426     }
 427
 428   return true;
 429 }
 430
 431 /* Function vect_is_nonlinear_iv_evolution
 432
 433    Only support nonlinear induction for integer type
 434    1. neg
 435    2. mul by constant
 436    3. lshift/rshift by constant.
 437
 438    For neg induction, return a fake step as integer -1.  */
 439 static bool
 440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
 441                                 gphi* loop_phi_node, tree *init, tree *step)
 442 {
 443   tree init_expr, ev_expr, result, op1, op2;
 444   gimple* def;
 445
 446   if (gimple_phi_num_args (loop_phi_node) != 2)
 447     return false;
 448
 449   init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
 450   ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
 451
 452   /* Support nonlinear induction only for integer type.  */
 453   if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
 454     return false;
 455
 456   *init = init_expr;
 457   result = PHI_RESULT (loop_phi_node);
 458
 459   if (TREE_CODE (ev_expr) != SSA_NAME
 460       || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
 461       || !is_gimple_assign (def))
 462     return false;
 463
 464   enum tree_code t_code = gimple_assign_rhs_code (def);
 465   switch (t_code)
 466     {
 467     case NEGATE_EXPR:
 468       if (gimple_assign_rhs1 (def) != result)
 469         return false;
 470       *step = build_int_cst (TREE_TYPE (init_expr), -1);
 471       STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
 472       break;
 473
 474     case RSHIFT_EXPR:
 475     case LSHIFT_EXPR:
 476     case MULT_EXPR:
 477       op1 = gimple_assign_rhs1 (def);
 478       op2 = gimple_assign_rhs2 (def);
 479       if (TREE_CODE (op2) != INTEGER_CST
 480           || op1 != result)
 481         return false;
 482       *step = op2;
 483       if (t_code == LSHIFT_EXPR)
 484         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
 485       else if (t_code == RSHIFT_EXPR)
 486         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
 487       /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul.  */
 488       else
 489         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
 490       break;
 491
 492     default:
 493       return false;
 494     }
 495
 496   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
 497   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
 498
 499   return true;
 500 }
 501
 502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 503    what we are assuming is a double reduction.  For example, given
 504    a structure like this:
 505
 506       outer1:
 507         x_1 = PHI <x_4(outer2), ...>;
 508         ...
 509
 510       inner:
 511         x_2 = PHI <x_1(outer1), ...>;
 512         ...
 513         x_3 = ...;
 514         ...
 515
 516       outer2:
 517         x_4 = PHI <x_3(inner)>;
 518         ...
 519
 520    outer loop analysis would treat x_1 as a double reduction phi and
 521    this function would then return true for x_2.  */
 522
 523 static bool
 524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 525 {
 526   use_operand_p use_p;
 527   ssa_op_iter op_iter;
 528   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 529     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 530       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 531         return true;
 532   return false;
 533 }
 534
 535 /* Returns true if Phi is a first-order recurrence. A first-order
 536    recurrence is a non-reduction recurrence relation in which the value of
 537    the recurrence in the current loop iteration equals a value defined in
 538    the previous iteration.  */
 539
 540 static bool
 541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
 542                                    gphi *phi)
 543 {
 544   /* A nested cycle isn't vectorizable as first order recurrence.  */
 545   if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
 546     return false;
 547
 548   /* Ensure the loop latch definition is from within the loop.  */
 549   edge latch = loop_latch_edge (loop);
 550   tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
 551   if (TREE_CODE (ldef) != SSA_NAME
 552       || SSA_NAME_IS_DEFAULT_DEF (ldef)
 553       || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
 554       || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
 555     return false;
 556
 557   tree def = gimple_phi_result (phi);
 558
 559   /* Ensure every use_stmt of the phi node is dominated by the latch
 560      definition.  */
 561   imm_use_iterator imm_iter;
 562   use_operand_p use_p;
 563   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
 564     if (!is_gimple_debug (USE_STMT (use_p))
 565         && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
 566             || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
 567                                             USE_STMT (use_p))))
 568       return false;
 569
 570   /* First-order recurrence autovectorization needs shuffle vector.  */
 571   tree scalar_type = TREE_TYPE (def);
 572   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 573   if (!vectype)
 574     return false;
 575
 576   return true;
 577 }
 578
 579 /* Function vect_analyze_scalar_cycles_1.
 580
 581    Examine the cross iteration def-use cycles of scalar variables
 582    in LOOP.  LOOP_VINFO represents the loop that is now being
 583    considered for vectorization (can be LOOP, or an outer-loop
 584    enclosing LOOP).  SLP indicates there will be some subsequent
 585    slp analyses or not.  */
 586
 587 static void
 588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
 589                               bool slp)
 590 {
 591   basic_block bb = loop->header;
 592   tree init, step;
 593   auto_vec<stmt_vec_info, 64> worklist;
 594   gphi_iterator gsi;
 595   bool double_reduc, reduc_chain;
 596
 597   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 598
 599   /* First - identify all inductions.  Reduction detection assumes that all the
 600      inductions have been identified, therefore, this order must not be
 601      changed.  */
 602   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 603     {
 604       gphi *phi = gsi.phi ();
 605       tree access_fn = NULL;
 606       tree def = PHI_RESULT (phi);
 607       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 608
 609       if (dump_enabled_p ())
 610         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 611                          (gimple *) phi);
 612
 613       /* Skip virtual phi's.  The data dependences that are associated with
 614          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 615       if (virtual_operand_p (def))
 616         continue;
 617
 618       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 619
 620       /* Analyze the evolution function.  */
 621       access_fn = analyze_scalar_evolution (loop, def);
 622       if (access_fn)
 623         {
 624           STRIP_NOPS (access_fn);
 625           if (dump_enabled_p ())
 626             dump_printf_loc (MSG_NOTE, vect_location,
 627                              "Access function of PHI: %T\n", access_fn);
 628           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 629             = initial_condition_in_loop_num (access_fn, loop->num);
 630           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 631             = evolution_part_in_loop_num (access_fn, loop->num);
 632         }
 633
 634       if ((!access_fn
 635            || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 636            || !vect_is_simple_iv_evolution (loop->num, access_fn,
 637                                             &init, &step)
 638            || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 639                && TREE_CODE (step) != INTEGER_CST))
 640           /* Only handle nonlinear iv for same loop.  */
 641           && (LOOP_VINFO_LOOP (loop_vinfo) != loop
 642               || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
 643                                                   phi, &init, &step)))
 644         {
 645           worklist.safe_push (stmt_vinfo);
 646           continue;
 647         }
 648
 649       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 650                   != NULL_TREE);
 651       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 652
 653       if (dump_enabled_p ())
 654         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 655       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 656     }
 657
 658
 659   /* Second - identify all reductions and nested cycles.  */
 660   while (worklist.length () > 0)
 661     {
 662       stmt_vec_info stmt_vinfo = worklist.pop ();
 663       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 664       tree def = PHI_RESULT (phi);
 665
 666       if (dump_enabled_p ())
 667         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 668                          (gimple *) phi);
 669
 670       gcc_assert (!virtual_operand_p (def)
 671                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 672
 673       stmt_vec_info reduc_stmt_info
 674         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 675                                     &reduc_chain, slp);
 676       if (reduc_stmt_info)
 677         {
 678           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 679           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 680           if (double_reduc)
 681             {
 682               if (dump_enabled_p ())
 683                 dump_printf_loc (MSG_NOTE, vect_location,
 684                                  "Detected double reduction.\n");
 685
 686               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 687               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 688             }
 689           else
 690             {
 691               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 692                 {
 693                   if (dump_enabled_p ())
 694                     dump_printf_loc (MSG_NOTE, vect_location,
 695                                      "Detected vectorizable nested cycle.\n");
 696
 697                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 698                 }
 699               else
 700                 {
 701                   if (dump_enabled_p ())
 702                     dump_printf_loc (MSG_NOTE, vect_location,
 703                                      "Detected reduction.\n");
 704
 705                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 706                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 707                   /* Store the reduction cycles for possible vectorization in
 708                      loop-aware SLP if it was not detected as reduction
 709                      chain.  */
 710                   if (! reduc_chain)
 711                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 712                       (reduc_stmt_info);
 713                 }
 714             }
 715         }
 716       else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
 717         STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
 718       else
 719         if (dump_enabled_p ())
 720           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 721                            "Unknown def-use cycle pattern.\n");
 722     }
 723 }
 724
 725
 726 /* Function vect_analyze_scalar_cycles.
 727
 728    Examine the cross iteration def-use cycles of scalar variables, by
 729    analyzing the loop-header PHIs of scalar variables.  Classify each
 730    cycle as one of the following: invariant, induction, reduction, unknown.
 731    We do that for the loop represented by LOOP_VINFO, and also to its
 732    inner-loop, if exists.
 733    Examples for scalar cycles:
 734
 735    Example1: reduction:
 736
 737               loop1:
 738               for (i=0; i<N; i++)
 739                  sum += a[i];
 740
 741    Example2: induction:
 742
 743               loop2:
 744               for (i=0; i<N; i++)
 745                  a[i] = i;  */
 746
 747 static void
 748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
 749 {
 750   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 751
 752   vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
 753
 754   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 755      Reductions in such inner-loop therefore have different properties than
 756      the reductions in the nest that gets vectorized:
 757      1. When vectorized, they are executed in the same order as in the original
 758         scalar loop, so we can't change the order of computation when
 759         vectorizing them.
 760      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 761         current checks are too strict.  */
 762
 763   if (loop->inner)
 764     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
 765 }
 766
 767 /* Transfer group and reduction information from STMT_INFO to its
 768    pattern stmt.  */
 769
 770 static void
 771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 772 {
 773   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 774   stmt_vec_info stmtp;
 775   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 776               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 777   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 778   do
 779     {
 780       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 781       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 782                            == STMT_VINFO_DEF_TYPE (stmt_info));
 783       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 784       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 785       if (stmt_info)
 786         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 787           = STMT_VINFO_RELATED_STMT (stmt_info);
 788     }
 789   while (stmt_info);
 790 }
 791
 792 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 793
 794 static void
 795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 796 {
 797   stmt_vec_info first;
 798   unsigned i;
 799
 800   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 801     {
 802       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 803       while (next)
 804         {
 805           if ((STMT_VINFO_IN_PATTERN_P (next)
 806                != STMT_VINFO_IN_PATTERN_P (first))
 807               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 808             break;
 809           next = REDUC_GROUP_NEXT_ELEMENT (next);
 810         }
 811       /* If all reduction chain members are well-formed patterns adjust
 812          the group to group the pattern stmts instead.  */
 813       if (! next
 814           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 815         {
 816           if (STMT_VINFO_IN_PATTERN_P (first))
 817             {
 818               vect_fixup_reduc_chain (first);
 819               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 820                 = STMT_VINFO_RELATED_STMT (first);
 821             }
 822         }
 823       /* If not all stmt in the chain are patterns or if we failed
 824          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 825          it as regular reduction instead.  */
 826       else
 827         {
 828           stmt_vec_info vinfo = first;
 829           stmt_vec_info last = NULL;
 830           while (vinfo)
 831             {
 832               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 833               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 834               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 835               last = vinfo;
 836               vinfo = next;
 837             }
 838           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 839             = vect_internal_def;
 840           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 841           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 842           --i;
 843         }
 844     }
 845 }
 846
 847 /* Function vect_get_loop_niters.
 848
 849    Determine how many iterations the loop is executed and place it
 850    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 851    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 852    niter information holds in ASSUMPTIONS.
 853
 854    Return the loop exit conditions.  */
 855
 856
 857 static vec<gcond *>
 858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
 859                       tree *number_of_iterations, tree *number_of_iterationsm1)
 860 {
 861   auto_vec<edge> exits = get_loop_exit_edges (loop);
 862   vec<gcond *> conds;
 863   conds.create (exits.length ());
 864   class tree_niter_desc niter_desc;
 865   tree niter_assumptions, niter, may_be_zero;
 866
 867   *assumptions = boolean_true_node;
 868   *number_of_iterationsm1 = chrec_dont_know;
 869   *number_of_iterations = chrec_dont_know;
 870
 871   DUMP_VECT_SCOPE ("get_loop_niters");
 872
 873   if (exits.is_empty ())
 874     return conds;
 875
 876   if (dump_enabled_p ())
 877     dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
 878                      exits.length ());
 879
 880   edge exit;
 881   unsigned int i;
 882   FOR_EACH_VEC_ELT (exits, i, exit)
 883     {
 884       gcond *cond = get_loop_exit_condition (exit);
 885       if (cond)
 886         conds.safe_push (cond);
 887
 888       if (dump_enabled_p ())
 889         dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
 890
 891       if (exit != main_exit)
 892         continue;
 893
 894       may_be_zero = NULL_TREE;
 895       if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 896           || chrec_contains_undetermined (niter_desc.niter))
 897         continue;
 898
 899       niter_assumptions = niter_desc.assumptions;
 900       may_be_zero = niter_desc.may_be_zero;
 901       niter = niter_desc.niter;
 902
 903       if (may_be_zero && integer_zerop (may_be_zero))
 904         may_be_zero = NULL_TREE;
 905
 906       if (may_be_zero)
 907         {
 908           if (COMPARISON_CLASS_P (may_be_zero))
 909             {
 910               /* Try to combine may_be_zero with assumptions, this can simplify
 911                  computation of niter expression.  */
 912               if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 913                 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 914                                                  niter_assumptions,
 915                                                  fold_build1 (TRUTH_NOT_EXPR,
 916                                                               boolean_type_node,
 917                                                               may_be_zero));
 918               else
 919                 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 920                                      build_int_cst (TREE_TYPE (niter), 0),
 921                                      rewrite_to_non_trapping_overflow (niter));
 922
 923               may_be_zero = NULL_TREE;
 924             }
 925           else if (integer_nonzerop (may_be_zero))
 926             {
 927               *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 928               *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 929               continue;
 930             }
 931           else
 932             continue;
 933        }
 934
 935       /* Loop assumptions are based off the normal exit.  */
 936       *assumptions = niter_assumptions;
 937       *number_of_iterationsm1 = niter;
 938
 939       /* We want the number of loop header executions which is the number
 940          of latch executions plus one.
 941          ???  For UINT_MAX latch executions this number overflows to zero
 942          for loops like do { n++; } while (n != 0);  */
 943       if (niter && !chrec_contains_undetermined (niter))
 944           niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
 945                                unshare_expr (niter),
 946                                build_int_cst (TREE_TYPE (niter), 1));
 947       *number_of_iterations = niter;
 948     }
 949
 950   if (dump_enabled_p ())
 951     dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
 952
 953   return conds;
 954 }
 955
 956 /*  Determine the main loop exit for the vectorizer.  */
 957
 958 edge
 959 vec_init_loop_exit_info (class loop *loop)
 960 {
 961   /* Before we begin we must first determine which exit is the main one and
 962      which are auxilary exits.  */
 963   auto_vec<edge> exits = get_loop_exit_edges (loop);
 964   if (exits.length () == 1)
 965     return exits[0];
 966
 967   /* If we have multiple exits we only support counting IV at the moment.  Analyze
 968      all exits and return one */
 969   class tree_niter_desc niter_desc;
 970   edge candidate = NULL;
 971   for (edge exit : exits)
 972     {
 973       if (!get_loop_exit_condition (exit))
 974         continue;
 975
 976       if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 977           && !chrec_contains_undetermined (niter_desc.niter))
 978         {
 979           if (!niter_desc.may_be_zero || !candidate)
 980             candidate = exit;
 981         }
 982     }
 983
 984   return candidate;
 985 }
 986
 987 /* Function bb_in_loop_p
 988
 989    Used as predicate for dfs order traversal of the loop bbs.  */
 990
 991 static bool
 992 bb_in_loop_p (const_basic_block bb, const void *data)
 993 {
 994   const class loop *const loop = (const class loop *)data;
 995   if (flow_bb_inside_loop_p (loop, bb))
 996     return true;
 997   return false;
 998 }
 999
1000
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1003
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005   : vec_info (vec_info::loop, shared),
1006     loop (loop_in),
1007     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008     num_itersm1 (NULL_TREE),
1009     num_iters (NULL_TREE),
1010     num_iters_unchanged (NULL_TREE),
1011     num_iters_assumptions (NULL_TREE),
1012     vector_costs (nullptr),
1013     scalar_costs (nullptr),
1014     th (0),
1015     versioning_threshold (0),
1016     vectorization_factor (0),
1017     main_loop_edge (nullptr),
1018     skip_main_loop_edge (nullptr),
1019     skip_this_loop_edge (nullptr),
1020     reusable_accumulators (),
1021     suggested_unroll_factor (1),
1022     max_vectorization_factor (0),
1023     mask_skip_niters (NULL_TREE),
1024     rgroup_compare_type (NULL_TREE),
1025     simd_if_cond (NULL_TREE),
1026     partial_vector_style (vect_partial_vectors_none),
1027     unaligned_dr (NULL),
1028     peeling_for_alignment (0),
1029     ptr_mask (0),
1030     ivexpr_map (NULL),
1031     scan_map (NULL),
1032     slp_unrolling_factor (1),
1033     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034     vectorizable (false),
1035     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036     using_partial_vectors_p (false),
1037     using_decrementing_iv_p (false),
1038     using_select_vl_p (false),
1039     epil_using_partial_vectors_p (false),
1040     partial_load_store_bias (0),
1041     peeling_for_gaps (false),
1042     peeling_for_niter (false),
1043     no_data_dependencies (false),
1044     has_mask_store (false),
1045     scalar_loop_scaling (profile_probability::uninitialized ()),
1046     scalar_loop (NULL),
1047     orig_loop_info (NULL),
1048     vec_loop_iv_exit (NULL),
1049     vec_epilogue_loop_iv_exit (NULL),
1050     scalar_loop_iv_exit (NULL)
1051 {
1052   /* CHECKME: We want to visit all BBs before their successors (except for
1053      latch blocks, for which this assertion wouldn't hold).  In the simple
1054      case of the loop forms we allow, a dfs order of the BBs would the same
1055      as reversed postorder traversal, so we are safe.  */
1056
1057   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1058                                           bbs, loop->num_nodes, loop);
1059   gcc_assert (nbbs == loop->num_nodes);
1060
1061   for (unsigned int i = 0; i < nbbs; i++)
1062     {
1063       basic_block bb = bbs[i];
1064       gimple_stmt_iterator si;
1065
1066       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1067         {
1068           gimple *phi = gsi_stmt (si);
1069           gimple_set_uid (phi, 0);
1070           add_stmt (phi);
1071         }
1072
1073       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1074         {
1075           gimple *stmt = gsi_stmt (si);
1076           gimple_set_uid (stmt, 0);
1077           if (is_gimple_debug (stmt))
1078             continue;
1079           add_stmt (stmt);
1080           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1081              third argument is the #pragma omp simd if (x) condition, when 0,
1082              loop shouldn't be vectorized, when non-zero constant, it should
1083              be vectorized normally, otherwise versioned with vectorized loop
1084              done if the condition is non-zero at runtime.  */
1085           if (loop_in->simduid
1086               && is_gimple_call (stmt)
1087               && gimple_call_internal_p (stmt)
1088               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1089               && gimple_call_num_args (stmt) >= 3
1090               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1091               && (loop_in->simduid
1092                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1093             {
1094               tree arg = gimple_call_arg (stmt, 2);
1095               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1096                 simd_if_cond = arg;
1097               else
1098                 gcc_assert (integer_nonzerop (arg));
1099             }
1100         }
1101     }
1102
1103   epilogue_vinfos.create (6);
1104 }
1105
1106 /* Free all levels of rgroup CONTROLS.  */
1107
1108 void
1109 release_vec_loop_controls (vec<rgroup_controls> *controls)
1110 {
1111   rgroup_controls *rgc;
1112   unsigned int i;
1113   FOR_EACH_VEC_ELT (*controls, i, rgc)
1114     rgc->controls.release ();
1115   controls->release ();
1116 }
1117
1118 /* Free all memory used by the _loop_vec_info, as well as all the
1119    stmt_vec_info structs of all the stmts in the loop.  */
1120
1121 _loop_vec_info::~_loop_vec_info ()
1122 {
1123   free (bbs);
1124
1125   release_vec_loop_controls (&masks.rgc_vec);
1126   release_vec_loop_controls (&lens);
1127   delete ivexpr_map;
1128   delete scan_map;
1129   epilogue_vinfos.release ();
1130   delete scalar_costs;
1131   delete vector_costs;
1132
1133   /* When we release an epiloge vinfo that we do not intend to use
1134      avoid clearing AUX of the main loop which should continue to
1135      point to the main loop vinfo since otherwise we'll leak that.  */
1136   if (loop->aux == this)
1137     loop->aux = NULL;
1138 }
1139
1140 /* Return an invariant or register for EXPR and emit necessary
1141    computations in the LOOP_VINFO loop preheader.  */
1142
1143 tree
1144 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1145 {
1146   if (is_gimple_reg (expr)
1147       || is_gimple_min_invariant (expr))
1148     return expr;
1149
1150   if (! loop_vinfo->ivexpr_map)
1151     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1152   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1153   if (! cached)
1154     {
1155       gimple_seq stmts = NULL;
1156       cached = force_gimple_operand (unshare_expr (expr),
1157                                      &stmts, true, NULL_TREE);
1158       if (stmts)
1159         {
1160           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1161           gsi_insert_seq_on_edge_immediate (e, stmts);
1162         }
1163     }
1164   return cached;
1165 }
1166
1167 /* Return true if we can use CMP_TYPE as the comparison type to produce
1168    all masks required to mask LOOP_VINFO.  */
1169
1170 static bool
1171 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1172 {
1173   rgroup_controls *rgm;
1174   unsigned int i;
1175   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1176     if (rgm->type != NULL_TREE
1177         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1178                                             cmp_type, rgm->type,
1179                                             OPTIMIZE_FOR_SPEED))
1180       return false;
1181   return true;
1182 }
1183
1184 /* Calculate the maximum number of scalars per iteration for every
1185    rgroup in LOOP_VINFO.  */
1186
1187 static unsigned int
1188 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1189 {
1190   unsigned int res = 1;
1191   unsigned int i;
1192   rgroup_controls *rgm;
1193   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194     res = MAX (res, rgm->max_nscalars_per_iter);
1195   return res;
1196 }
1197
1198 /* Calculate the minimum precision necessary to represent:
1199
1200       MAX_NITERS * FACTOR
1201
1202    as an unsigned integer, where MAX_NITERS is the maximum number of
1203    loop header iterations for the original scalar form of LOOP_VINFO.  */
1204
1205 static unsigned
1206 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1207 {
1208   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1209
1210   /* Get the maximum number of iterations that is representable
1211      in the counter type.  */
1212   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1213   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1214
1215   /* Get a more refined estimate for the number of iterations.  */
1216   widest_int max_back_edges;
1217   if (max_loop_iterations (loop, &max_back_edges))
1218     max_ni = wi::smin (max_ni, max_back_edges + 1);
1219
1220   /* Work out how many bits we need to represent the limit.  */
1221   return wi::min_precision (max_ni * factor, UNSIGNED);
1222 }
1223
1224 /* True if the loop needs peeling or partial vectors when vectorized.  */
1225
1226 static bool
1227 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1228 {
1229   unsigned HOST_WIDE_INT const_vf;
1230   HOST_WIDE_INT max_niter
1231     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1232
1233   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1234   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1235     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1236                                           (loop_vinfo));
1237
1238   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1239       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1240     {
1241       /* Work out the (constant) number of iterations that need to be
1242          peeled for reasons other than niters.  */
1243       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1244       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1245         peel_niter += 1;
1246       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1247                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1248         return true;
1249     }
1250   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1251       /* ??? When peeling for gaps but not alignment, we could
1252          try to check whether the (variable) niters is known to be
1253          VF * N + 1.  That's something of a niche case though.  */
1254       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1255       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1256       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1257            < (unsigned) exact_log2 (const_vf))
1258           /* In case of versioning, check if the maximum number of
1259              iterations is greater than th.  If they are identical,
1260              the epilogue is unnecessary.  */
1261           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1262               || ((unsigned HOST_WIDE_INT) max_niter
1263                   > (th / const_vf) * const_vf))))
1264     return true;
1265
1266   return false;
1267 }
1268
1269 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1270    whether we can actually generate the masks required.  Return true if so,
1271    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1272
1273 static bool
1274 vect_verify_full_masking (loop_vec_info loop_vinfo)
1275 {
1276   unsigned int min_ni_width;
1277
1278   /* Use a normal loop if there are no statements that need masking.
1279      This only happens in rare degenerate cases: it means that the loop
1280      has no loads, no stores, and no live-out values.  */
1281   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1282     return false;
1283
1284   /* Produce the rgroup controls.  */
1285   for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1286     {
1287       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1288       tree vectype = mask.first;
1289       unsigned nvectors = mask.second;
1290
1291       if (masks->rgc_vec.length () < nvectors)
1292         masks->rgc_vec.safe_grow_cleared (nvectors, true);
1293       rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1294       /* The number of scalars per iteration and the number of vectors are
1295          both compile-time constants.  */
1296       unsigned int nscalars_per_iter
1297           = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1298                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1299
1300       if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1301         {
1302           rgm->max_nscalars_per_iter = nscalars_per_iter;
1303           rgm->type = truth_type_for (vectype);
1304           rgm->factor = 1;
1305         }
1306     }
1307
1308   unsigned int max_nscalars_per_iter
1309     = vect_get_max_nscalars_per_iter (loop_vinfo);
1310
1311   /* Work out how many bits we need to represent the limit.  */
1312   min_ni_width
1313     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1314
1315   /* Find a scalar mode for which WHILE_ULT is supported.  */
1316   opt_scalar_int_mode cmp_mode_iter;
1317   tree cmp_type = NULL_TREE;
1318   tree iv_type = NULL_TREE;
1319   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1320   unsigned int iv_precision = UINT_MAX;
1321
1322   if (iv_limit != -1)
1323     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1324                                       UNSIGNED);
1325
1326   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1327     {
1328       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1329       if (cmp_bits >= min_ni_width
1330           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1331         {
1332           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1333           if (this_type
1334               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1335             {
1336               /* Although we could stop as soon as we find a valid mode,
1337                  there are at least two reasons why that's not always the
1338                  best choice:
1339
1340                  - An IV that's Pmode or wider is more likely to be reusable
1341                    in address calculations than an IV that's narrower than
1342                    Pmode.
1343
1344                  - Doing the comparison in IV_PRECISION or wider allows
1345                    a natural 0-based IV, whereas using a narrower comparison
1346                    type requires mitigations against wrap-around.
1347
1348                  Conversely, if the IV limit is variable, doing the comparison
1349                  in a wider type than the original type can introduce
1350                  unnecessary extensions, so picking the widest valid mode
1351                  is not always a good choice either.
1352
1353                  Here we prefer the first IV type that's Pmode or wider,
1354                  and the first comparison type that's IV_PRECISION or wider.
1355                  (The comparison type must be no wider than the IV type,
1356                  to avoid extensions in the vector loop.)
1357
1358                  ??? We might want to try continuing beyond Pmode for ILP32
1359                  targets if CMP_BITS < IV_PRECISION.  */
1360               iv_type = this_type;
1361               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1362                 cmp_type = this_type;
1363               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1364                 break;
1365             }
1366         }
1367     }
1368
1369   if (!cmp_type)
1370     {
1371       LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1372       return false;
1373     }
1374
1375   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1376   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1377   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1378   return true;
1379 }
1380
1381 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1382    whether we can actually generate AVX512 style masks.  Return true if so,
1383    storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */
1384
1385 static bool
1386 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1387 {
1388   /* Produce differently organized rgc_vec and differently check
1389      we can produce masks.  */
1390
1391   /* Use a normal loop if there are no statements that need masking.
1392      This only happens in rare degenerate cases: it means that the loop
1393      has no loads, no stores, and no live-out values.  */
1394   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1395     return false;
1396
1397   /* For the decrementing IV we need to represent all values in
1398      [0, niter + niter_skip] where niter_skip is the elements we
1399      skip in the first iteration for prologue peeling.  */
1400   tree iv_type = NULL_TREE;
1401   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1402   unsigned int iv_precision = UINT_MAX;
1403   if (iv_limit != -1)
1404     iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1405
1406   /* First compute the type for the IV we use to track the remaining
1407      scalar iterations.  */
1408   opt_scalar_int_mode cmp_mode_iter;
1409   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1410     {
1411       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1412       if (cmp_bits >= iv_precision
1413           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1414         {
1415           iv_type = build_nonstandard_integer_type (cmp_bits, true);
1416           if (iv_type)
1417             break;
1418         }
1419     }
1420   if (!iv_type)
1421     return false;
1422
1423   /* Produce the rgroup controls.  */
1424   for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1425     {
1426       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1427       tree vectype = mask.first;
1428       unsigned nvectors = mask.second;
1429
1430       /* The number of scalars per iteration and the number of vectors are
1431          both compile-time constants.  */
1432       unsigned int nscalars_per_iter
1433         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1434                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1435
1436       /* We index the rgroup_controls vector with nscalars_per_iter
1437          which we keep constant and instead have a varying nvectors,
1438          remembering the vector mask with the fewest nV.  */
1439       if (masks->rgc_vec.length () < nscalars_per_iter)
1440         masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1441       rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1442
1443       if (!rgm->type || rgm->factor > nvectors)
1444         {
1445           rgm->type = truth_type_for (vectype);
1446           rgm->compare_type = NULL_TREE;
1447           rgm->max_nscalars_per_iter = nscalars_per_iter;
1448           rgm->factor = nvectors;
1449           rgm->bias_adjusted_ctrl = NULL_TREE;
1450         }
1451     }
1452
1453   /* There is no fixed compare type we are going to use but we have to
1454      be able to get at one for each mask group.  */
1455   unsigned int min_ni_width
1456     = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1457
1458   bool ok = true;
1459   for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1460     {
1461       tree mask_type = rgc.type;
1462       if (!mask_type)
1463         continue;
1464
1465       /* For now vect_get_loop_mask only supports integer mode masks
1466          when we need to split it.  */
1467       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1468           || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1469         {
1470           ok = false;
1471           break;
1472         }
1473
1474       /* If iv_type is usable as compare type use that - we can elide the
1475          saturation in that case.   */
1476       if (TYPE_PRECISION (iv_type) >= min_ni_width)
1477         {
1478           tree cmp_vectype
1479             = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1480           if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1481             rgc.compare_type = cmp_vectype;
1482         }
1483       if (!rgc.compare_type)
1484         FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1485           {
1486             unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1487             if (cmp_bits >= min_ni_width
1488                 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1489               {
1490                 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1491                 if (!cmp_type)
1492                   continue;
1493
1494                 /* Check whether we can produce the mask with cmp_type.  */
1495                 tree cmp_vectype
1496                   = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1497                 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1498                   {
1499                     rgc.compare_type = cmp_vectype;
1500                     break;
1501                   }
1502               }
1503         }
1504       if (!rgc.compare_type)
1505         {
1506           ok = false;
1507           break;
1508         }
1509     }
1510   if (!ok)
1511     {
1512       release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1513       return false;
1514     }
1515
1516   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1517   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1518   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1519   return true;
1520 }
1521
1522 /* Check whether we can use vector access with length based on precison
1523    comparison.  So far, to keep it simple, we only allow the case that the
1524    precision of the target supported length is larger than the precision
1525    required by loop niters.  */
1526
1527 static bool
1528 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1529 {
1530   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1531     return false;
1532
1533   machine_mode len_load_mode, len_store_mode;
1534   if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1535          .exists (&len_load_mode))
1536     return false;
1537   if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1538          .exists (&len_store_mode))
1539     return false;
1540
1541   signed char partial_load_bias = internal_len_load_store_bias
1542     (IFN_LEN_LOAD, len_load_mode);
1543
1544   signed char partial_store_bias = internal_len_load_store_bias
1545     (IFN_LEN_STORE, len_store_mode);
1546
1547   gcc_assert (partial_load_bias == partial_store_bias);
1548
1549   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1550     return false;
1551
1552   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1553      len_loads with a length of zero.  In order to avoid that we prohibit
1554      more than one loop length here.  */
1555   if (partial_load_bias == -1
1556       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1557     return false;
1558
1559   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1560
1561   unsigned int max_nitems_per_iter = 1;
1562   unsigned int i;
1563   rgroup_controls *rgl;
1564   /* Find the maximum number of items per iteration for every rgroup.  */
1565   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1566     {
1567       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1568       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1569     }
1570
1571   /* Work out how many bits we need to represent the length limit.  */
1572   unsigned int min_ni_prec
1573     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1574
1575   /* Now use the maximum of below precisions for one suitable IV type:
1576      - the IV's natural precision
1577      - the precision needed to hold: the maximum number of scalar
1578        iterations multiplied by the scale factor (min_ni_prec above)
1579      - the Pmode precision
1580
1581      If min_ni_prec is less than the precision of the current niters,
1582      we perfer to still use the niters type.  Prefer to use Pmode and
1583      wider IV to avoid narrow conversions.  */
1584
1585   unsigned int ni_prec
1586     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1587   min_ni_prec = MAX (min_ni_prec, ni_prec);
1588   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1589
1590   tree iv_type = NULL_TREE;
1591   opt_scalar_int_mode tmode_iter;
1592   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1593     {
1594       scalar_mode tmode = tmode_iter.require ();
1595       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1596
1597       /* ??? Do we really want to construct one IV whose precision exceeds
1598          BITS_PER_WORD?  */
1599       if (tbits > BITS_PER_WORD)
1600         break;
1601
1602       /* Find the first available standard integral type.  */
1603       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1604         {
1605           iv_type = build_nonstandard_integer_type (tbits, true);
1606           break;
1607         }
1608     }
1609
1610   if (!iv_type)
1611     {
1612       if (dump_enabled_p ())
1613         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1614                          "can't vectorize with length-based partial vectors"
1615                          " because there is no suitable iv type.\n");
1616       return false;
1617     }
1618
1619   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1620   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1621   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1622
1623   return true;
1624 }
1625
1626 /* Calculate the cost of one scalar iteration of the loop.

        Walk every statement of every basic block of LOOP_VINFO's loop and,
        for each assignment or call that passes the relevance filter below,
        record a scalar_load, scalar_store or scalar_stmt entry in
        LOOP_VINFO_SCALAR_ITERATION_COST.  Statements in blocks whose
        loop_father is the inner loop are recorded with a count of
        LOOP_VINFO_INNER_LOOP_COST_FACTOR instead of 1.  The recorded entries
        are then handed to a cost object created with init_cost and stored in
        LOOP_VINFO->scalar_costs.  */
1627 static void
1628 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1629 {
1630   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1632   int nbbs = loop->num_nodes, factor;
1633   int innerloop_iters, i;
1634
1635   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1636
1637   /* Gather costs for statements in the scalar loop.  */
1638
1639   /* FORNOW.  */
1640   innerloop_iters = 1;
1641   if (loop->inner)
1642     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1643
1644   for (i = 0; i < nbbs; i++)
1645     {
1646       gimple_stmt_iterator si;
1647       basic_block bb = bbs[i];
1648
           /* Blocks of the inner loop use the inner-loop factor; every
              other block uses a factor of 1.  */
1649       if (bb->loop_father == loop->inner)
1650         factor = innerloop_iters;
1651       else
1652         factor = 1;
1653
1654       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1655         {
1656           gimple *stmt = gsi_stmt (si);
1657           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1658
               /* Only assignments and calls are costed.  */
1659           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1660             continue;
1661
1662           /* Skip stmts that are not vectorized inside the loop.  */
               /* Relevance is tested on what vect_stmt_to_vectorize returns,
                  while the cost below is recorded against STMT_INFO.  */
1663           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1664           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1665               && (!STMT_VINFO_LIVE_P (vstmt_info)
1666                   || !VECTORIZABLE_CYCLE_DEF
1667                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1668             continue;
1669
               /* Classify the statement: one with a data reference is a
                  scalar_load when DR_IS_READ holds and a scalar_store
                  otherwise, a no-op conversion is not costed at all, and
                  anything else is a scalar_stmt.  */
1670           vect_cost_for_stmt kind;
1671           if (STMT_VINFO_DATA_REF (stmt_info))
1672             {
1673               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1674                kind = scalar_load;
1675              else
1676                kind = scalar_store;
1677             }
1678           else if (vect_nop_conversion_p (stmt_info))
1679             continue;
1680           else
1681             kind = scalar_stmt;
1682
1683           /* We are using vect_prologue here to avoid scaling twice
1684              by the inner loop factor.  */
1685           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1686                             factor, kind, stmt_info, 0, vect_prologue);
1687         }
1688     }
1689
1690   /* Now accumulate cost.  */
1691   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1692   add_stmt_costs (loop_vinfo->scalar_costs,
1693                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1694   loop_vinfo->scalar_costs->finish_cost (nullptr);
1695 }
1696
1697
1698 /* Function vect_analyze_loop_form.
1699
1700    Verify that certain CFG restrictions hold, including:
1701    - the loop has a pre-header
1702    - the loop has a single entry and exit
1703    - the loop exit condition is simple enough
1704    - the number of iterations can be analyzed, i.e, a countable loop.  The
1705      niter could be analyzed under some assumptions.

        LOOP is the loop to check.  On success INFO has been filled in with
        the main exit edge (loop_exit), the exit conditions (conds, with the
        condition located in the main exit's source block swapped to
        index 0), the niter assumptions, and the iteration counts
        (number_of_iterations and number_of_iterationsm1).  inner_loop_cond
        is set to conds[0] of the inner loop's analysis for a nested loop and
        to NULL otherwise.

        Returns opt_result::success () when every check passes, otherwise the
        opt_result failure for the first check that did not hold.  For a
        nested loop the function recurses once on LOOP->inner.  */
1706
1707 opt_result
1708 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1709 {
1710   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1711
       /* Choose the exit edge treated as the main exit; a null result is
          reported as a failure.  */
1712   edge exit_e = vec_init_loop_exit_info (loop);
1713   if (!exit_e)
1714     return opt_result::failure_at (vect_location,
1715                                    "not vectorized:"
1716                                    " could not determine main exit from"
1717                                    " loop with multiple exits.\n");
1718   info->loop_exit = exit_e;
1719   if (dump_enabled_p ())
1720       dump_printf_loc (MSG_NOTE, vect_location,
1721                        "using as main loop exit: %d -> %d [AUX: %p]\n",
1722                        exit_e->src->index, exit_e->dest->index, exit_e->aux);
1723
1724   /* Different restrictions apply when we are considering an inner-most loop,
1725      vs. an outer (nested) loop.
1726      (FORNOW. May want to relax some of these restrictions in the future).  */
1727
1728   info->inner_loop_cond = NULL;
1729   if (!loop->inner)
1730     {
1731       /* Inner-most loop.  We currently require that the number of BBs is
1732          exactly 2 (the header and latch).  Vectorizable inner-most loops
1733          look like this:
1734
1735                         (pre-header)
1736                            |
1737                           header <--------+
1738                            | |            |
1739                            | +--> latch --+
1740                            |
1741                         (exit-bb)  */
1742
1743       if (loop->num_nodes != 2)
1744         return opt_result::failure_at (vect_location,
1745                                        "not vectorized:"
1746                                        " control flow in loop.\n");
1747
1748       if (empty_block_p (loop->header))
1749         return opt_result::failure_at (vect_location,
1750                                        "not vectorized: empty loop.\n");
1751     }
1752   else
1753     {
1754       class loop *innerloop = loop->inner;
1755       edge entryedge;
1756
1757       /* Nested loop. We currently require that the loop is doubly-nested,
1758          contains a single inner loop, and the number of BBs is exactly 5.
1759          Vectorizable outer-loops look like this:
1760
1761                         (pre-header)
1762                            |
1763                           header <---+
1764                            |         |
1765                           inner-loop |
1766                            |         |
1767                           tail ------+
1768                            |
1769                         (exit-bb)
1770
1771          The inner-loop has the properties expected of inner-most loops
1772          as described above.  */
1773
1774       if ((loop->inner)->inner || (loop->inner)->next)
1775         return opt_result::failure_at (vect_location,
1776                                        "not vectorized:"
1777                                        " multiple nested loops.\n");
1778
1779       if (loop->num_nodes != 5)
1780         return opt_result::failure_at (vect_location,
1781                                        "not vectorized:"
1782                                        " control flow in loop.\n");
1783
           /* The inner loop must be entered from the outer header, and its
              single exit must lead to the source block of the outer latch's
              first predecessor edge.  */
1784       entryedge = loop_preheader_edge (innerloop);
1785       if (entryedge->src != loop->header
1786           || !single_exit (innerloop)
1787           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1788         return opt_result::failure_at (vect_location,
1789                                        "not vectorized:"
1790                                        " unsupported outerloop form.\n");
1791
1792       /* Analyze the inner-loop.  */
1793       vect_loop_form_info inner;
1794       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1795       if (!res)
1796         {
1797           if (dump_enabled_p ())
1798             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1799                              "not vectorized: Bad inner loop.\n");
1800           return res;
1801         }
1802
1803       /* Don't support analyzing niter under assumptions for inner
1804          loop.  */
1805       if (!integer_onep (inner.assumptions))
1806         return opt_result::failure_at (vect_location,
1807                                        "not vectorized: Bad inner loop.\n");
1808
1809       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1810         return opt_result::failure_at (vect_location,
1811                                        "not vectorized: inner-loop count not"
1812                                        " invariant.\n");
1813
1814       if (dump_enabled_p ())
1815         dump_printf_loc (MSG_NOTE, vect_location,
1816                          "Considering outer-loop vectorization.\n");
1817       info->inner_loop_cond = inner.conds[0];
1818     }
1819
       /* A main exit was chosen above, but a loop for which single_exit
          returns null is still rejected here.  */
1820   if (!single_exit (loop))
1821     return opt_result::failure_at (vect_location,
1822                                    "not vectorized: multiple exits.\n");
       /* The header must have exactly two predecessor edges.  */
1823   if (EDGE_COUNT (loop->header->preds) != 2)
1824     return opt_result::failure_at (vect_location,
1825                                    "not vectorized:"
1826                                    " too many incoming edges.\n");
1827
1828   /* We assume that the loop exit condition is at the end of the loop. i.e,
1829      that the loop is represented as a do-while (with a proper if-guard
1830      before the loop if needed), where the loop header contains all the
1831      executable statements, and the latch is empty.  */
1832   if (!empty_block_p (loop->latch)
1833       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1834     return opt_result::failure_at (vect_location,
1835                                    "not vectorized: latch block not empty.\n");
1836
1837   /* Make sure the exit is not abnormal.  */
1838   if (exit_e->flags & EDGE_ABNORMAL)
1839     return opt_result::failure_at (vect_location,
1840                                    "not vectorized:"
1841                                    " abnormal loop exit edge.\n");
1842
       /* Collect the exit conditions together with the niter assumptions and
          iteration counts; an empty result is treated as a failure.  */
1843   info->conds
1844     = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1845                             &info->number_of_iterations,
1846                             &info->number_of_iterationsm1);
1847
1848   if (info->conds.is_empty ())
1849     return opt_result::failure_at
1850       (vect_location,
1851        "not vectorized: complicated exit condition.\n");
1852
1853   /* Determine what the primary and alternate exit conds are.  */
       /* The condition living in the main exit's source block is swapped
          into slot 0.  */
1854   for (unsigned i = 0; i < info->conds.length (); i++)
1855     {
1856       gcond *cond = info->conds[i];
1857       if (exit_e->src == gimple_bb (cond))
1858         std::swap (info->conds[0], info->conds[i]);
1859     }
1860
1861   if (integer_zerop (info->assumptions)
1862       || !info->number_of_iterations
1863       || chrec_contains_undetermined (info->number_of_iterations))
1864     return opt_result::failure_at
1865       (info->conds[0],
1866        "not vectorized: number of iterations cannot be computed.\n");
1867
1868   if (integer_zerop (info->number_of_iterations))
1869     return opt_result::failure_at
1870       (info->conds[0],
1871        "not vectorized: number of iterations = 0.\n");
1872
       /* Diagnostic only: when the iteration count is not a positive value
          accepted by tree_fits_shwi_p, dump the symbolic expression.  */
1873   if (!(tree_fits_shwi_p (info->number_of_iterations)
1874         && tree_to_shwi (info->number_of_iterations) > 0))
1875     {
1876       if (dump_enabled_p ())
1877         {
1878           dump_printf_loc (MSG_NOTE, vect_location,
1879                            "Symbolic number of iterations is ");
1880           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1881           dump_printf (MSG_NOTE, "\n");
1882         }
1883     }
1884
1885   return opt_result::success ();
1886 }
1887
1888 /* Create a loop_vec_info for LOOP with SHARED and the
1889    vect_analyze_loop_form result.

        INFO supplies the iteration counts, niter assumptions, exit
        conditions and main exit edge that are copied into the new
        loop_vec_info.  MAIN_LOOP_INFO is stored as
        LOOP_VINFO_ORIG_LOOP_INFO; the niter assumptions are only recorded
        when MAIN_LOOP_INFO is null and the assumptions are not integer_onep.

        Returns the loop_vec_info allocated here with `new'.
        NOTE(review): ownership presumably passes to the caller -- confirm
        against the call sites.  */
1890
1891 loop_vec_info
1892 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1893                         const vect_loop_form_info *info,
1894                         loop_vec_info main_loop_info)
1895 {
1896   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1897   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1898   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1899   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1900   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1901   /* Also record the assumptions for versioning.  */
1902   if (!integer_onep (info->assumptions) && !main_loop_info)
1903     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1904
       /* Tag every exit condition as a loop-exit control statement.  */
1905   for (gcond *cond : info->conds)
1906     {
1907       stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1908       STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1909     }
1910
       /* conds[0] becomes the IV condition; the remaining conditions are
          pushed onto LOOP_VINFO_LOOP_CONDS.  */
1911   for (unsigned i = 1; i < info->conds.length (); i ++)
1912     LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1913   LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1914
1915   LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1916
1917   if (info->inner_loop_cond)
1918     {
1919       stmt_vec_info inner_loop_cond_info
1920         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1921       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1922       /* If we have an estimate on the number of iterations of the inner
1923          loop use that to limit the scale for costing, otherwise use
1924          --param vect-inner-loop-cost-factor literally.  */
1925       widest_int nit;
1926       if (estimated_stmt_executions (loop->inner, &nit))
1927         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1928           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1929     }
1930
1931   return loop_vinfo;
1932 }
1933
1934
1935
1936 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1937    statements update the vectorization factor.

        If every phi and statement that is relevant, or whose def type is a
        vectorizable cycle def, is a pure SLP statement, the vectorization
        factor of LOOP_VINFO becomes LOOP_VINFO_SLP_UNROLLING_FACTOR.
        Otherwise it becomes the result of force_common_multiple applied to
        the current vectorization factor and the SLP unrolling factor.  The
        result is written back to LOOP_VINFO_VECT_FACTOR.  */
1938
1939 static void
1940 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1941 {
1942   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1943   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1944   int nbbs = loop->num_nodes;
1945   poly_uint64 vectorization_factor;
1946   int i;
1947
1948   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1949
1950   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1951   gcc_assert (known_ne (vectorization_factor, 0U));
1952
1953   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1954      vectorization factor of the loop is the unrolling factor required by
1955      the SLP instances.  If that unrolling factor is 1, we say, that we
1956      perform pure SLP on loop - cross iteration parallelism is not
1957      exploited.  */
1958   bool only_slp_in_loop = true;
1959   for (i = 0; i < nbbs; i++)
1960     {
1961       basic_block bb = bbs[i];
           /* Phis first; a phi without a stmt_vec_info is ignored.  */
1962       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1963            gsi_next (&si))
1964         {
1965           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1966           if (!stmt_info)
1967             continue;
1968           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1969                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1970               && !PURE_SLP_STMT (stmt_info))
1971             /* STMT needs both SLP and loop-based vectorization.  */
1972             only_slp_in_loop = false;
1973         }
           /* Then the ordinary statements.  Debug statements are skipped,
              and the test is applied to what vect_stmt_to_vectorize
              returns.  */
1974       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1975            gsi_next (&si))
1976         {
1977           if (is_gimple_debug (gsi_stmt (si)))
1978             continue;
1979           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1980           stmt_info = vect_stmt_to_vectorize (stmt_info);
1981           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1982                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1983               && !PURE_SLP_STMT (stmt_info))
1984             /* STMT needs both SLP and loop-based vectorization.  */
1985             only_slp_in_loop = false;
1986         }
1987     }
1988
1989   if (only_slp_in_loop)
1990     {
1991       if (dump_enabled_p ())
1992         dump_printf_loc (MSG_NOTE, vect_location,
1993                          "Loop contains only SLP stmts\n");
1994       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1995     }
1996   else
1997     {
1998       if (dump_enabled_p ())
1999         dump_printf_loc (MSG_NOTE, vect_location,
2000                          "Loop contains SLP and non-SLP stmts\n");
2001       /* Both the vectorization factor and unroll factor have the form
2002          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2003          so they must have a common multiple.  */
2004       vectorization_factor
2005         = force_common_multiple (vectorization_factor,
2006                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2007     }
2008
2009   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2010   if (dump_enabled_p ())
2011     {
2012       dump_printf_loc (MSG_NOTE, vect_location,
2013                        "Updating vectorization factor to ");
2014       dump_dec (MSG_NOTE, vectorization_factor);
2015       dump_printf (MSG_NOTE, ".\n");
2016     }
2017 }
2018
2019 /* Return true if STMT_INFO describes a double reduction phi and if
2020    the other phi in the reduction is also relevant for vectorization.
2021    This rejects cases such as:
2022
2023       outer1:
2024         x_1 = PHI <x_3(outer2), ...>;
2025         ...
2026
2027       inner:
2028         x_2 = ...;
2029         ...
2030
2031       outer2:
2032         x_3 = PHI <x_2(inner)>;
2033
2034    if nothing in x_2 or elsewhere makes x_1 relevant.  */
2035
2036 static bool
2037 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2038 {
       /* Anything whose def type is not a double reduction is rejected
          outright.  */
2039   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2040     return false;
2041
       /* Otherwise the answer is the relevance of the statement recorded as
          STMT_INFO's STMT_VINFO_REDUC_DEF.  */
2042   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2043 }
2044
2045 /* Function vect_analyze_loop_operations.
2046
2047    Scan the loop stmts and make sure they are all vectorizable.

        In each basic block of LOOP_VINFO's loop, every non-virtual phi and
        every statement that is neither a clobber nor a debug statement is
        checked.  The costs collected by the individual checks are added to
        LOOP_VINFO->vector_costs once all blocks have been scanned.

        Returns opt_result::success () when everything checked is supported
        and at least one phi or statement needs vectorizing.  Otherwise
        returns the failure for the first unsupported phi or statement, or
        the "redundant loop" failure when nothing needs to be vectorized.  */
2048
2049 static opt_result
2050 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2051 {
2052   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2053   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2054   int nbbs = loop->num_nodes;
2055   int i;
2056   stmt_vec_info stmt_info;
2057   bool need_to_vectorize = false;
2058   bool ok;
2059
2060   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2061
2062   auto_vec<stmt_info_for_cost> cost_vec;
2063
2064   for (i = 0; i < nbbs; i++)
2065     {
2066       basic_block bb = bbs[i];
2067
2068       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2069            gsi_next (&si))
2070         {
2071           gphi *phi = si.phi ();
2072           ok = true;
2073
2074           stmt_info = loop_vinfo->lookup_stmt (phi);
2075           if (dump_enabled_p ())
2076             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2077                              (gimple *) phi);
               /* Phis of virtual operands are not analyzed.  */
2078           if (virtual_operand_p (gimple_phi_result (phi)))
2079             continue;
2080
2081           /* Inner-loop loop-closed exit phi in outer-loop vectorization
2082              (i.e., a phi in the tail of the outer-loop).  */
2083           if (! is_loop_header_bb_p (bb))
2084             {
2085               /* FORNOW: we currently don't support the case that these phis
2086                  are not used in the outerloop (unless it is double reduction,
2087                  i.e., this phi is vect_reduction_def), cause this case
2088                  requires to actually do something here.  */
2089               if (STMT_VINFO_LIVE_P (stmt_info)
2090                   && !vect_active_double_reduction_p (stmt_info))
2091                 return opt_result::failure_at (phi,
2092                                                "Unsupported loop-closed phi"
2093                                                " in outer-loop.\n");
2094
2095               /* If PHI is used in the outer loop, we check that its operand
2096                  is defined in the inner loop.  */
2097               if (STMT_VINFO_RELEVANT_P (stmt_info))
2098                 {
2099                   tree phi_op;
2100
                       /* The message ends in a newline like the other
                          "unsupported phi" failures below.  */
2101                   if (gimple_phi_num_args (phi) != 1)
2102                     return opt_result::failure_at (phi, "unsupported phi\n");
2103
2104                   phi_op = PHI_ARG_DEF (phi, 0);
2105                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2106                   if (!op_def_info)
2107                     return opt_result::failure_at (phi, "unsupported phi\n");
2108
2109                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2110                       && (STMT_VINFO_RELEVANT (op_def_info)
2111                           != vect_used_in_outer_by_reduction))
2112                     return opt_result::failure_at (phi, "unsupported phi\n");
2113
2114                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2115                        || (STMT_VINFO_DEF_TYPE (stmt_info)
2116                            == vect_double_reduction_def))
2117                       && !vectorizable_lc_phi (loop_vinfo,
2118                                                stmt_info, NULL, NULL))
2119                     return opt_result::failure_at (phi, "unsupported phi\n");
2120                 }
2121
2122               continue;
2123             }
2124
2125           gcc_assert (stmt_info);
2126
2127           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2128                || STMT_VINFO_LIVE_P (stmt_info))
2129               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2130               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2131             /* A scalar-dependence cycle that we don't support.  */
2132             return opt_result::failure_at (phi,
2133                                            "not vectorized:"
2134                                            " scalar dependence cycle.\n");
2135
               /* Relevant header phis that are not pure SLP are dispatched on
                  their def type to the induction, reduction or recurrence
                  check; OK keeps its initial value of true when none of the
                  branches applies.  */
2136           if (STMT_VINFO_RELEVANT_P (stmt_info))
2137             {
2138               need_to_vectorize = true;
2139               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2140                   && ! PURE_SLP_STMT (stmt_info))
2141                 ok = vectorizable_induction (loop_vinfo,
2142                                              stmt_info, NULL, NULL,
2143                                              &cost_vec);
2144               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2145                         || (STMT_VINFO_DEF_TYPE (stmt_info)
2146                             == vect_double_reduction_def)
2147                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2148                        && ! PURE_SLP_STMT (stmt_info))
2149                 ok = vectorizable_reduction (loop_vinfo,
2150                                              stmt_info, NULL, NULL, &cost_vec);
2151               else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2152                         == vect_first_order_recurrence)
2153                        && ! PURE_SLP_STMT (stmt_info))
2154                 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2155                                            &cost_vec);
2156             }
2157
2158           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
2159           if (ok
2160               && STMT_VINFO_LIVE_P (stmt_info)
2161               && !PURE_SLP_STMT (stmt_info))
2162             ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2163                                               -1, false, &cost_vec);
2164
2165           if (!ok)
2166             return opt_result::failure_at (phi,
2167                                            "not vectorized: relevant phi not "
2168                                            "supported: %G",
2169                                            static_cast <gimple *> (phi));
2170         }
2171
           /* Ordinary statements: everything except clobbers and debug
              statements goes through vect_analyze_stmt, which may also set
              NEED_TO_VECTORIZE.  */
2172       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2173            gsi_next (&si))
2174         {
2175           gimple *stmt = gsi_stmt (si);
2176           if (!gimple_clobber_p (stmt)
2177               && !is_gimple_debug (stmt))
2178             {
2179               opt_result res
2180                 = vect_analyze_stmt (loop_vinfo,
2181                                      loop_vinfo->lookup_stmt (stmt),
2182                                      &need_to_vectorize,
2183                                      NULL, NULL, &cost_vec);
2184               if (!res)
2185                 return res;
2186             }
2187         }
2188     } /* bbs */
2189
2190   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2191
2192   /* All operations in the loop are either irrelevant (deal with loop
2193      control, or dead), or only used outside the loop and can be moved
2194      out of the loop (e.g. invariants, inductions).  The loop can be
2195      optimized away by scalar optimizations.  We're better off not
2196      touching this loop.  */
2197   if (!need_to_vectorize)
2198     {
2199       if (dump_enabled_p ())
2200         dump_printf_loc (MSG_NOTE, vect_location,
2201                          "All the computation can be taken out of the loop.\n");
2202       return opt_result::failure_at
2203         (vect_location,
2204          "not vectorized: redundant loop. no profit to vectorize.\n");
2205     }
2206
2207   return opt_result::success ();
2208 }
2209
2210 /* Return true if we know that the iteration count is smaller than the
2211    vectorization factor.  Return false if it isn't, or if we can't be sure
2212    either way.  */
2213
2214 static bool
2215 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2216 {
       /* The comparison uses the vectorization factor assumed for costing,
          not LOOP_VINFO_VECT_FACTOR directly.  */
2217   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2218
       /* Use LOOP_VINFO_INT_NITERS when LOOP_VINFO_NITERS_KNOWN_P holds,
          otherwise whatever max_stmt_executions_int reports for the loop.  */
2219   HOST_WIDE_INT max_niter;
2220   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2221     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2222   else
2223     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2224
       /* A MAX_NITER of -1 never counts as "smaller".
          NOTE(review): presumably -1 is max_stmt_executions_int's "no bound
          known" result -- confirm.  */
2225   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2226     return true;
2227
2228   return false;
2229 }
2230
2231 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
2232    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
2233    definitely no, or -1 if it's worth retrying.  */
2234
2235 static int
2236 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2237                            unsigned *suggested_unroll_factor)
2238 {
2239   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2240   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2241
2242   /* Only loops that can handle partially-populated vectors can have iteration
2243      counts less than the vectorization factor.  */
2244   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2245       && vect_known_niters_smaller_than_vf (loop_vinfo))
2246     {
2247       if (dump_enabled_p ())
2248         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249                          "not vectorized: iteration count smaller than "
2250                          "vectorization factor.\n");
2251       return 0;
2252     }
2253
2254   /* If we know the number of iterations we can do better, for the
2255      epilogue we can also decide whether the main loop leaves us
2256      with enough iterations, preferring a smaller vector epilog then
2257      also possibly used for the case we skip the vector loop.  */
2258   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2259     {
2260       widest_int scalar_niters
2261         = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2262       if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2263         {
2264           loop_vec_info orig_loop_vinfo
2265             = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2266           unsigned lowest_vf
2267             = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2268           int prolog_peeling = 0;
2269           if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2270             prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2271           if (prolog_peeling >= 0
2272               && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2273                            lowest_vf))
2274             {
2275               unsigned gap
2276                 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2277               scalar_niters = ((scalar_niters - gap - prolog_peeling)
2278                                % lowest_vf + gap);
2279             }
2280         }
2281       /* Reject vectorizing for a single scalar iteration, even if
2282          we could in principle implement that using partial vectors.  */
2283       unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2284       if (scalar_niters <= peeling_gap + 1)
2285         {
2286           if (dump_enabled_p ())
2287             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2288                              "not vectorized: loop only has a single "
2289                              "scalar iteration.\n");
2290           return 0;
2291         }
2292
2293       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2294         {
2295           /* Check that the loop processes at least one full vector.  */
2296           poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2297           if (known_lt (scalar_niters, vf))
2298             {
2299               if (dump_enabled_p ())
2300                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301                                  "loop does not have enough iterations "
2302                                  "to support vectorization.\n");
2303               return 0;
2304             }
2305
2306           /* If we need to peel an extra epilogue iteration to handle data
2307              accesses with gaps, check that there are enough scalar iterations
2308              available.
2309
2310              The check above is redundant with this one when peeling for gaps,
2311              but the distinction is useful for diagnostics.  */
2312           if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2313               && known_le (scalar_niters, vf))
2314             {
2315               if (dump_enabled_p ())
2316                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317                                  "loop does not have enough iterations "
2318                                  "to support peeling for gaps.\n");
2319               return 0;
2320             }
2321         }
2322     }
2323
2324   /* If using the "very cheap" model, reject cases in which we'd keep
2325      a copy of the scalar code (even if we might be able to vectorize it).  */
2326   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2327       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2328           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2329           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2330     {
2331       if (dump_enabled_p ())
2332         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333                          "some scalar iterations would need to be peeled\n");
2334       return 0;
2335     }
2336
2337   int min_profitable_iters, min_profitable_estimate;
2338   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2339                                       &min_profitable_estimate,
2340                                       suggested_unroll_factor);
2341
2342   if (min_profitable_iters < 0)
2343     {
2344       if (dump_enabled_p ())
2345         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346                          "not vectorized: vectorization not profitable.\n");
2347       if (dump_enabled_p ())
2348         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2349                          "not vectorized: vector version will never be "
2350                          "profitable.\n");
2351       return -1;
2352     }
2353
2354   int min_scalar_loop_bound = (param_min_vect_loop_bound
2355                                * assumed_vf);
2356
2357   /* Use the cost model only if it is more conservative than user specified
2358      threshold.  */
2359   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2360                                     min_profitable_iters);
2361
2362   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2363
2364   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2365       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2366     {
2367       if (dump_enabled_p ())
2368         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369                          "not vectorized: vectorization not profitable.\n");
2370       if (dump_enabled_p ())
2371         dump_printf_loc (MSG_NOTE, vect_location,
2372                          "not vectorized: iteration count smaller than user "
2373                          "specified loop bound parameter or minimum profitable "
2374                          "iterations (whichever is more conservative).\n");
2375       return 0;
2376     }
2377
2378   /* The static profitablity threshold min_profitable_estimate includes
2379      the cost of having to check at runtime whether the scalar loop
2380      should be used instead.  If it turns out that we don't need or want
2381      such a check, the threshold we should use for the static estimate
2382      is simply the point at which the vector loop becomes more profitable
2383      than the scalar loop.  */
2384   if (min_profitable_estimate > min_profitable_iters
2385       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2386       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2387       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2388       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2389     {
2390       if (dump_enabled_p ())
2391         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2392                          " choice between the scalar and vector loops\n");
2393       min_profitable_estimate = min_profitable_iters;
2394     }
2395
2396   /* If the vector loop needs multiple iterations to be beneficial then
2397      things are probably too close to call, and the conservative thing
2398      would be to stick with the scalar code.  */
2399   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2400       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2401     {
2402       if (dump_enabled_p ())
2403         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2404                          "one iteration of the vector loop would be"
2405                          " more expensive than the equivalent number of"
2406                          " iterations of the scalar loop\n");
2407       return 0;
2408     }
2409
2410   HOST_WIDE_INT estimated_niter;
2411
2412   /* If we are vectorizing an epilogue then we know the maximum number of
2413      scalar iterations it will cover is at least one lower than the
2414      vectorization factor of the main loop.  */
2415   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2416     estimated_niter
2417       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2418   else
2419     {
2420       estimated_niter = estimated_stmt_executions_int (loop);
2421       if (estimated_niter == -1)
2422         estimated_niter = likely_max_stmt_executions_int (loop);
2423     }
2424   if (estimated_niter != -1
2425       && ((unsigned HOST_WIDE_INT) estimated_niter
2426           < MAX (th, (unsigned) min_profitable_estimate)))
2427     {
2428       if (dump_enabled_p ())
2429         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2430                          "not vectorized: estimated iteration count too "
2431                          "small.\n");
2432       if (dump_enabled_p ())
2433         dump_printf_loc (MSG_NOTE, vect_location,
2434                          "not vectorized: estimated iteration count smaller "
2435                          "than specified loop bound parameter or minimum "
2436                          "profitable iterations (whichever is more "
2437                          "conservative).\n");
2438       return -1;
2439     }
2440
2441   return 1;
2442 }
2443
/* Walk all statements in the LOOP->num_nodes basic blocks of BBS and run
   vect_find_stmt_data_reference on each non-debug statement, handing it
   DATAREFS to collect into.  *N_STMTS is set to the number of non-debug
   statements visited, including any statement whose failure is ignored.

   The first failing statement ends the walk and its failure is returned,
   except for calls in a loop with nonzero safelen whose callee has SIMD
   clones and whose arguments and lhs are neither declarations nor
   references with a base address.  The walk also fails as soon as
   DATAREFS holds more than param_loop_max_datarefs_for_datadeps entries.
   Otherwise opt_result::success () is returned.  */
2444 static opt_result
2445 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2446                            vec<data_reference_p> *datarefs,
2447                            unsigned int *n_stmts)
2448 {
2449   *n_stmts = 0;
2450   for (unsigned i = 0; i < loop->num_nodes; i++)
2451     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2452          !gsi_end_p (gsi); gsi_next (&gsi))
2453       {
2454         gimple *stmt = gsi_stmt (gsi);
2455         if (is_gimple_debug (stmt))
2456           continue;
             /* Count the statement before analyzing it, so it is counted
                even when the analysis below fails.  */
2457         ++(*n_stmts);
2458         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2459                                                         NULL, 0);
2460         if (!res)
2461           {
                 /* A failure can only be tolerated for a call statement in
                    a loop whose safelen is nonzero.  */
2462             if (is_gimple_call (stmt) && loop->safelen)
2463               {
2464                 tree fndecl = gimple_call_fndecl (stmt), op;
                     /* For an IFN_MASK_CALL internal call, argument 0 is
                        expected to be the address of the called function
                        (checked by the assertions below).  */
2465                 if (fndecl == NULL_TREE
2466                     && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2467                   {
2468                     fndecl = gimple_call_arg (stmt, 0);
2469                     gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2470                     fndecl = TREE_OPERAND (fndecl, 0);
2471                     gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2472                   }
2473                 if (fndecl != NULL_TREE)
2474                   {
2475                     cgraph_node *node = cgraph_node::get (fndecl);
2476                     if (node != NULL && node->simd_clones != NULL)
2477                       {
2478                         unsigned int j, n = gimple_call_num_args (stmt);
                             /* Stop at the first argument that is a
                                declaration or a reference with a base
                                address; J == N afterwards means that no
                                such argument exists.  */
2479                         for (j = 0; j < n; j++)
2480                           {
2481                             op = gimple_call_arg (stmt, j);
2482                             if (DECL_P (op)
2483                                 || (REFERENCE_CLASS_P (op)
2484                                     && get_base_address (op)))
2485                               break;
2486                           }
2487                         op = gimple_call_lhs (stmt);
2488                         /* Ignore #pragma omp declare simd functions
2489                            if they don't have data references in the
2490                            call stmt itself.  */
2491                         if (j == n
2492                             && !(op
2493                                  && (DECL_P (op)
2494                                      || (REFERENCE_CLASS_P (op)
2495                                          && get_base_address (op)))))
2496                           continue;
2497                       }
2498                   }
2499               }
                 /* Every other failure is propagated to the caller.  */
2500             return res;
2501           }
2502         /* If dependence analysis will give up due to the limit on the
2503            number of datarefs stop here and fail fatally.  */
2504         if (datarefs->length ()
2505             > (unsigned)param_loop_max_datarefs_for_datadeps)
2506           return opt_result::failure_at (stmt, "exceeded param "
2507                                          "loop-max-datarefs-for-datadeps\n");
2508       }
2509   return opt_result::success ();
2510 }
2511
2512 /* Look for SLP-only access groups and turn each individual access into its own
2513    group.

        This walks every data reference of LOOP_VINFO.  When the statement of
        a data reference is a grouped access that is not marked for SLP
        (STMT_SLP_TYPE is zero) and the group leader has
        STMT_VINFO_SLP_VECT_ONLY set, the flag is cleared and every member of
        the chain becomes the leader of its own group of size 1.  The
        alignment information of the old leader is propagated to the other
        members.  */
2514 static void
2515 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2516 {
2517   unsigned int i;
2518   struct data_reference *dr;
2519
2520   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2521
2522   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2523   FOR_EACH_VEC_ELT (datarefs, i, dr)
2524     {
2525       gcc_assert (DR_REF (dr));
2526       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2527
2528       /* Check if the access is a part of an interleaving chain.  */
2529       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2530         {
2531           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2532           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
               /* Read the size up front; the walk below resets DR_GROUP_SIZE
                  of every member, FIRST_ELEMENT included, to 1.  */
2533           unsigned int group_size = DR_GROUP_SIZE (first_element);
2534
2535           /* Check if SLP-only groups.  */
2536           if (!STMT_SLP_TYPE (stmt_info)
2537               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2538             {
2539               /* Dissolve the group.  */
2540               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2541
2542               stmt_vec_info vinfo = first_element;
2543               while (vinfo)
2544                 {
                       /* Fetch the successor first, the link is cleared
                          just below.  */
2545                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2546                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2547                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2548                   DR_GROUP_SIZE (vinfo) = 1;
2549                   if (STMT_VINFO_STRIDED_P (first_element)
2550                       /* We cannot handle stores with gaps.  */
2551                       || DR_IS_WRITE (dr_info->dr))
2552                     {
2553                       STMT_VINFO_STRIDED_P (vinfo) = true;
2554                       DR_GROUP_GAP (vinfo) = 0;
2555                     }
2556                   else
2557                     DR_GROUP_GAP (vinfo) = group_size - 1;
2558                   /* Duplicate and adjust alignment info, it needs to
2559                      be present on each group leader, see dr_misalignment.  */
2560                   if (vinfo != first_element)
2561                     {
2562                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2563                       dr_info2->target_alignment = dr_info->target_alignment;
2564                       int misalignment = dr_info->misalignment;
2565                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2566                         {
                               /* Shift the old leader's misalignment by the
                                  distance between the two DR_INITs.  ALIGN_C
                                  is unsigned, so the % below is evaluated as
                                  an unsigned operation.  */
2567                           HOST_WIDE_INT diff
2568                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2569                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2570                           unsigned HOST_WIDE_INT align_c
2571                             = dr_info->target_alignment.to_constant ();
2572                           misalignment = (misalignment + diff) % align_c;
2573                         }
2574                       dr_info2->misalignment = misalignment;
2575                     }
2576                   vinfo = next;
2577                 }
2578             }
2579         }
2580     }
2581 }
2582
2583 /* Determine if operating on full vectors for LOOP_VINFO might leave
2584    some scalar iterations still to do.  If so, decide how we should
2585    handle those scalar iterations.  The possibilities are:
2586
2587    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2588        In this case:
2589
2590          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2591          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2592          LOOP_VINFO_PEELING_FOR_NITER == false
2593
2594    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2595        to handle the remaining scalar iterations.  In this case:
2596
2597          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2598          LOOP_VINFO_PEELING_FOR_NITER == true
2599
2600        There are two choices:
2601
2602        (2a) Consider vectorizing the epilogue loop at the same VF as the
2603             main loop, but using partial vectors instead of full vectors.
2604             In this case:
2605
2606               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2607
2608        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2609             In this case:
2610
2611               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false

        If no scalar iterations would be left over, all three flags above
        end up false.  LOOP_VINFO_USING_SELECT_VL_P is additionally cleared
        whenever the loop ends up not using partial vectors.  The function
        currently always returns opt_result::success ().
2612  */
2613
2614 opt_result
2615 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2616 {
2617   /* Determine whether there would be any scalar iterations left over.  */
2618   bool need_peeling_or_partial_vectors_p
2619     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2620
2621   /* Decide whether to vectorize the loop with partial vectors.  */
2622   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2623   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2624   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2625       && need_peeling_or_partial_vectors_p)
2626     {
2627       /* For partial-vector-usage=1, try to push the handling of partial
2628          vectors to the epilogue, with the main loop continuing to operate
2629          on full vectors.
2630
2631          If we are unrolling we also do not want to use partial vectors. This
2632          is to avoid the overhead of generating multiple masks and also to
2633          avoid having to execute entire iterations of FALSE masked instructions
2634          when dealing with one or less full iterations.
2635
2636          ??? We could then end up failing to use partial vectors if we
2637          decide to peel iterations into a prologue, and if the main loop
2638          then ends up processing fewer than VF iterations.  */
2639       if ((param_vect_partial_vector_usage == 1
2640            || loop_vinfo->suggested_unroll_factor > 1)
2641           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2642           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2643         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2644       else
2645         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2646     }
2647
2648   if (dump_enabled_p ())
2649     dump_printf_loc (MSG_NOTE, vect_location,
2650                      "operating on %s vectors%s.\n",
2651                      LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2652                      ? "partial" : "full",
2653                      LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2654                      ? " for epilogue loop" : "");
2655
       /* Peel for niters only when iterations are left over and the loop
          itself is not handling them with partial vectors.  */
2656   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2657     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2658        && need_peeling_or_partial_vectors_p);
2659
2660   /* LOOP_VINFO_USING_SELECT_VL_P is set to true before loop vectorization
2661      analysis, when we do not yet know whether the loop will be vectorized
2662      using partial vectors (for more details see tree-vect-loop-manip.cc).
2663
2664      However, the SELECT_VL vectorization style should only be applied to
2665      partial vectorization, since SELECT_VL is the GIMPLE IR that calculates
2666      the number of elements to be processed in each iteration.
2667
2668      After loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2669      if the loop is not vectorized using partial vectors.  */
2670   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2671     LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2672
2673   return opt_result::success ();
2674 }
2675
2676 /* Function vect_analyze_loop_2.
2677
2678    Apply a set of analyses to the loop described by LOOP_VINFO; the
2679    analyses record their results in members of LOOP_VINFO.  FATAL
2680    indicates whether an analysis hit a fatal error.  If a non-NULL
2681    SUGGESTED_UNROLL_FACTOR pointer is provided, it is intended to be
2682    filled with the suggested unroll factor that was worked out, whereas
2683    a NULL pointer means the suggested unroll factor is being applied.
2684    SLP_DONE_FOR_SUGGESTED_UF holds the SLP decision made when the
2685    suggested unroll factor was worked out.  */
2686 static opt_result
2687 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2688                      unsigned *suggested_unroll_factor,
2689                      bool& slp_done_for_suggested_uf)
2690 {
2691   opt_result ok = opt_result::success ();
2692   int res;
2693   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2694   poly_uint64 min_vf = 2;
2695   loop_vec_info orig_loop_vinfo = NULL;
2696
2697   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2698      loop_vec_info of the first vectorized loop.  */
2699   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2700     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2701   else
2702     orig_loop_vinfo = loop_vinfo;
2703   gcc_assert (orig_loop_vinfo);
2704
2705   /* The first group of checks is independent of the vector size.  */
2706   fatal = true;
2707
2708   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2709       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2710     return opt_result::failure_at (vect_location,
2711                                    "not vectorized: simd if(0)\n");
2712
2713   /* Find all data references in the loop (which correspond to vdefs/vuses)
2714      and analyze their evolution in the loop.  */
2715
2716   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2717
2718   /* Gather the data references and count stmts in the loop.  */
2719   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2720     {
2721       opt_result res
2722         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2723                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2724                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2725       if (!res)
2726         {
2727           if (dump_enabled_p ())
2728             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2729                              "not vectorized: loop contains function "
2730                              "calls or data references that cannot "
2731                              "be analyzed\n");
2732           return res;
2733         }
2734       loop_vinfo->shared->save_datarefs ();
2735     }
2736   else
2737     loop_vinfo->shared->check_datarefs ();
2738
2739   /* Analyze the data references and also adjust the minimal
2740      vectorization factor according to the loads and stores.  */
2741
2742   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2743   if (!ok)
2744     {
2745       if (dump_enabled_p ())
2746         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2747                          "bad data references.\n");
2748       return ok;
2749     }
2750
2751   /* Check if we are applying unroll factor now.  */
2752   bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2753   gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2754
2755   /* If the slp decision is false when suggested unroll factor is worked
2756      out, and we are applying suggested unroll factor, we can simply skip
2757      all slp related analyses this time.  */
2758   bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2759
2760   /* Classify all cross-iteration scalar data-flow cycles.
2761      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2762   vect_analyze_scalar_cycles (loop_vinfo, slp);
2763
2764   vect_pattern_recog (loop_vinfo);
2765
2766   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2767
2768   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2769      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2770
2771   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2772   if (!ok)
2773     {
2774       if (dump_enabled_p ())
2775         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776                          "bad data access.\n");
2777       return ok;
2778     }
2779
2780   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2781
2782   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2783   if (!ok)
2784     {
2785       if (dump_enabled_p ())
2786         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2787                          "unexpected pattern.\n");
2788       return ok;
2789     }
2790
2791   /* While the rest of the analysis below depends on it in some way.  */
2792   fatal = false;
2793
2794   /* Analyze data dependences between the data-refs in the loop
2795      and adjust the maximum vectorization factor according to
2796      the dependences.
2797      FORNOW: fail at the first data dependence that we encounter.  */
2798
2799   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2800   if (!ok)
2801     {
2802       if (dump_enabled_p ())
2803         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2804                          "bad data dependence.\n");
2805       return ok;
2806     }
2807   if (max_vf != MAX_VECTORIZATION_FACTOR
2808       && maybe_lt (max_vf, min_vf))
2809     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2810   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2811
2812   ok = vect_determine_vectorization_factor (loop_vinfo);
2813   if (!ok)
2814     {
2815       if (dump_enabled_p ())
2816         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2817                          "can't determine vectorization factor.\n");
2818       return ok;
2819     }
2820
2821   /* Compute the scalar iteration cost.  */
2822   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2823
2824   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2825
2826   if (slp)
2827     {
2828       /* Check the SLP opportunities in the loop, analyze and build
2829          SLP trees.  */
2830       ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2831       if (!ok)
2832         return ok;
2833
2834       /* If there are any SLP instances mark them as pure_slp.  */
2835       slp = vect_make_slp_decision (loop_vinfo);
2836       if (slp)
2837         {
2838           /* Find stmts that need to be both vectorized and SLPed.  */
2839           vect_detect_hybrid_slp (loop_vinfo);
2840
2841           /* Update the vectorization factor based on the SLP decision.  */
2842           vect_update_vf_for_slp (loop_vinfo);
2843
2844           /* Optimize the SLP graph with the vectorization factor fixed.  */
2845           vect_optimize_slp (loop_vinfo);
2846
2847           /* Gather the loads reachable from the SLP graph entries.  */
2848           vect_gather_slp_loads (loop_vinfo);
2849         }
2850     }
2851
2852   bool saved_can_use_partial_vectors_p
2853     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2854
2855   /* We don't expect to have to roll back to anything other than an empty
2856      set of rgroups.  */
2857   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2858
2859   /* This is the point where we can re-start analysis with SLP forced off.  */
2860 start_over:
2861
2862   /* Apply the suggested unrolling factor, this was determined by the backend
2863      during finish_cost the first time we ran the analyzis for this
2864      vector mode.  */
2865   if (applying_suggested_uf)
2866     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2867
2868   /* Now the vectorization factor is final.  */
2869   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2870   gcc_assert (known_ne (vectorization_factor, 0U));
2871
2872   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2873     {
2874       dump_printf_loc (MSG_NOTE, vect_location,
2875                        "vectorization_factor = ");
2876       dump_dec (MSG_NOTE, vectorization_factor);
2877       dump_printf (MSG_NOTE, ", niters = %wd\n",
2878                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2879     }
2880
2881   if (max_vf != MAX_VECTORIZATION_FACTOR
2882       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2883     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2884
2885   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2886
2887   /* Analyze the alignment of the data-refs in the loop.
2888      Fail if a data reference is found that cannot be vectorized.  */
2889
2890   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2891   if (!ok)
2892     {
2893       if (dump_enabled_p ())
2894         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2895                          "bad data alignment.\n");
2896       return ok;
2897     }
2898
2899   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2900      It is important to call pruning after vect_analyze_data_ref_accesses,
2901      since we use grouping information gathered by interleaving analysis.  */
2902   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2903   if (!ok)
2904     return ok;
2905
2906   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2907      vectorization, since we do not want to add extra peeling or
2908      add versioning for alignment.  */
2909   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2910     /* This pass will decide on using loop versioning and/or loop peeling in
2911        order to enhance the alignment of data references in the loop.  */
2912     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2913   if (!ok)
2914     return ok;
2915
2916   if (slp)
2917     {
2918       /* Analyze operations in the SLP instances.  Note this may
2919          remove unsupported SLP instances which makes the above
2920          SLP kind detection invalid.  */
2921       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2922       vect_slp_analyze_operations (loop_vinfo);
2923       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2924         {
2925           ok = opt_result::failure_at (vect_location,
2926                                        "unsupported SLP instances\n");
2927           goto again;
2928         }
2929
2930       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2931       slp_tree load_node, slp_root;
2932       unsigned i, x;
2933       slp_instance instance;
2934       bool can_use_lanes = true;
2935       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2936         {
2937           slp_root = SLP_INSTANCE_TREE (instance);
2938           int group_size = SLP_TREE_LANES (slp_root);
2939           tree vectype = SLP_TREE_VECTYPE (slp_root);
2940           bool loads_permuted = false;
2941           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2942             {
2943               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2944                 continue;
2945               unsigned j;
2946               stmt_vec_info load_info;
2947               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2948                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2949                   {
2950                     loads_permuted = true;
2951                     break;
2952                   }
2953             }
2954
2955           /* If the loads and stores can be handled with load/store-lane
2956              instructions record it and move on to the next instance.  */
2957           if (loads_permuted
2958               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2959               && vect_store_lanes_supported (vectype, group_size, false)
2960                    != IFN_LAST)
2961             {
2962               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2963                 if (STMT_VINFO_GROUPED_ACCESS
2964                       (SLP_TREE_REPRESENTATIVE (load_node)))
2965                   {
2966                     stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2967                         (SLP_TREE_REPRESENTATIVE (load_node));
2968                     /* Use SLP for strided accesses (or if we can't
2969                        load-lanes).  */
2970                     if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2971                         || vect_load_lanes_supported
2972                              (STMT_VINFO_VECTYPE (stmt_vinfo),
2973                               DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2974                       break;
2975                   }
2976
2977               can_use_lanes
2978                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2979
2980               if (can_use_lanes && dump_enabled_p ())
2981                 dump_printf_loc (MSG_NOTE, vect_location,
2982                                  "SLP instance %p can use load/store-lanes\n",
2983                                  (void *) instance);
2984             }
2985           else
2986             {
2987               can_use_lanes = false;
2988               break;
2989             }
2990         }
2991
2992       /* If all SLP instances can use load/store-lanes abort SLP and try again
2993          with SLP disabled.  */
2994       if (can_use_lanes)
2995         {
2996           ok = opt_result::failure_at (vect_location,
2997                                        "Built SLP cancelled: can use "
2998                                        "load/store-lanes\n");
2999           if (dump_enabled_p ())
3000             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3001                              "Built SLP cancelled: all SLP instances support "
3002                              "load/store-lanes\n");
3003           goto again;
3004         }
3005     }
3006
3007   /* Dissolve SLP-only groups.  */
3008   vect_dissolve_slp_only_groups (loop_vinfo);
3009
3010   /* Scan all the remaining operations in the loop that are not subject
3011      to SLP and make sure they are vectorizable.  */
3012   ok = vect_analyze_loop_operations (loop_vinfo);
3013   if (!ok)
3014     {
3015       if (dump_enabled_p ())
3016         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3017                          "bad operation or unsupported loop bound.\n");
3018       return ok;
3019     }
3020
3021   /* For now, we don't expect to mix both masking and length approaches for one
3022      loop, disable it if both are recorded.  */
3023   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3024       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3025       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3026     {
3027       if (dump_enabled_p ())
3028         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3029                          "can't vectorize a loop with partial vectors"
3030                          " because we don't expect to mix different"
3031                          " approaches with partial vectors for the"
3032                          " same loop.\n");
3033       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3034     }
3035
3036   /* If we still have the option of using partial vectors,
3037      check whether we can generate the necessary loop controls.  */
3038   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3039     {
3040       if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3041         {
3042           if (!vect_verify_full_masking (loop_vinfo)
3043               && !vect_verify_full_masking_avx512 (loop_vinfo))
3044             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3045         }
3046       else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3047         if (!vect_verify_loop_lens (loop_vinfo))
3048           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3049     }
3050
3051   /* If we're vectorizing a loop that uses length "controls" and
3052      can iterate more than once, we apply decrementing IV approach
3053      in loop control.  */
3054   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3055       && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3056       && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3057       && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3058            && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3059                         LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3060     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3061
3062   /* If a loop uses length controls and has a decrementing loop control IV,
3063      we will normally pass that IV through a MIN_EXPR to calculate the
3064      basis for the length controls.  E.g. in a loop that processes one
3065      element per scalar iteration, the number of elements would be
3066      MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3067
3068      This MIN_EXPR approach allows us to use pointer IVs with an invariant
3069      step, since only the final iteration of the vector loop can have
3070      inactive lanes.
3071
3072      However, some targets have a dedicated instruction for calculating the
3073      preferred length, given the total number of elements that still need to
3074      be processed.  This is encapsulated in the SELECT_VL internal function.
3075
3076      If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3077      to determine the basis for the length controls.  However, unlike the
3078      MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3079      lanes inactive in any iteration of the vector loop, not just the last
3080      iteration.  This SELECT_VL approach therefore requires us to use pointer
3081      IVs with variable steps.
3082
3083      Once we've decided how many elements should be processed by one
3084      iteration of the vector loop, we need to populate the rgroup controls.
3085      If a loop has multiple rgroups, we need to make sure that those rgroups
3086      "line up" (that is, they must be consistent about which elements are
3087      active and which aren't).  This is done by vect_adjust_loop_lens_control.
3088
3089      In principle, it would be possible to use vect_adjust_loop_lens_control
3090      on either the result of a MIN_EXPR or the result of a SELECT_VL.
3091      However:
3092
3093      (1) In practice, it only makes sense to use SELECT_VL when a vector
3094          operation will be controlled directly by the result.  It is not
3095          worth using SELECT_VL if it would only be the input to other
3096          calculations.
3097
3098      (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3099          pointer IV will need N updates by a variable amount (N-1 updates
3100          within the iteration and 1 update to move to the next iteration).
3101
3102      Because of this, we prefer to use the MIN_EXPR approach whenever there
3103      is more than one length control.
3104
3105      In addition, SELECT_VL always operates to a granularity of 1 unit.
3106      If we wanted to use it to control an SLP operation on N consecutive
3107      elements, we would need to make the SELECT_VL inputs measure scalar
3108      iterations (rather than elements) and then multiply the SELECT_VL
3109      result by N.  But using SELECT_VL this way is inefficient because
3110      of (1) above.
3111
3112      Finally, we don't use SELECT_VL for a single rgroup when both of
3113      the following conditions hold:
3114
3115      (a) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3116      (b) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3117
3118      The variable step introduced by SELECT_VL makes SCEV analysis fail,
3119      so we would lose the benefit of later unrolling optimizations.  We
3120      therefore prefer the MIN_EXPR approach in this situation.  */
3121   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3122     {
3123       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3124       if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3125                                           OPTIMIZE_FOR_SPEED)
3126           && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3127           && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3128           && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3129               || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3130         LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3131     }
3132
3133   /* Decide whether this loop_vinfo should use partial vectors or peeling,
3134      assuming that the loop will be used as a main loop.  We will redo
3135      this analysis later if we instead decide to use the loop as an
3136      epilogue loop.  */
3137   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3138   if (!ok)
3139     return ok;
3140
3141   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3142      to be able to handle fewer than VF scalars, or needs to have a lower VF
3143      than the main loop.  */
3144   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3145       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3146     {
3147       poly_uint64 unscaled_vf
3148         = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3149                      orig_loop_vinfo->suggested_unroll_factor);
3150       if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3151         return opt_result::failure_at (vect_location,
3152                                        "Vectorization factor too high for"
3153                                        " epilogue loop.\n");
3154     }
3155
3156   /* Check the costings of the loop make vectorizing worthwhile.  */
3157   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3158   if (res < 0)
3159     {
3160       ok = opt_result::failure_at (vect_location,
3161                                    "Loop costings may not be worthwhile.\n");
3162       goto again;
3163     }
3164   if (!res)
3165     return opt_result::failure_at (vect_location,
3166                                    "Loop costings not worthwhile.\n");
3167
3168   /* If an epilogue loop is required make sure we can create one.  */
3169   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3170       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3171     {
3172       if (dump_enabled_p ())
3173         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3174       if (!vect_can_advance_ivs_p (loop_vinfo)
3175           || !slpeel_can_duplicate_loop_p (loop,
3176                                            LOOP_VINFO_IV_EXIT (loop_vinfo),
3177                                            LOOP_VINFO_IV_EXIT (loop_vinfo)))
3178         {
3179           ok = opt_result::failure_at (vect_location,
3180                                        "not vectorized: can't create required "
3181                                        "epilog loop\n");
3182           goto again;
3183         }
3184     }
3185
3186   /* During peeling, we need to check if number of loop iterations is
3187      enough for both peeled prolog loop and vector loop.  This check
3188      can be merged along with threshold check of loop versioning, so
3189      increase threshold for this case if necessary.
3190
3191      If we are analyzing an epilogue we still want to check what its
3192      versioning threshold would be.  If we decide to vectorize the epilogues we
3193      will want to use the lowest versioning threshold of all epilogues and main
3194      loop.  This will enable us to enter a vectorized epilogue even when
3195      versioning the loop.  We can't simply check whether the epilogue requires
3196      versioning though since we may have skipped some versioning checks when
3197      analyzing the epilogue.  For instance, checks for alias versioning will be
3198      skipped when dealing with epilogues as we assume we already checked them
3199      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
3200   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3201     {
3202       poly_uint64 niters_th = 0;
3203       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3204
3205       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3206         {
3207           /* Niters for peeled prolog loop.  */
3208           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3209             {
3210               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3211               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3212               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3213             }
3214           else
3215             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3216         }
3217
3218       /* Niters for at least one iteration of vectorized loop.  */
3219       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3220         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3221       /* One additional iteration because of peeling for gap.  */
3222       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3223         niters_th += 1;
3224
3225       /*  Use the same condition as vect_transform_loop to decide when to use
3226           the cost to determine a versioning threshold.  */
3227       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3228           && ordered_p (th, niters_th))
3229         niters_th = ordered_max (poly_uint64 (th), niters_th);
3230
3231       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3232     }
3233
3234   gcc_assert (known_eq (vectorization_factor,
3235                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3236
3237   slp_done_for_suggested_uf = slp;
3238
3239   /* Ok to vectorize!  */
3240   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3241   return opt_result::success ();
3242
3243 again:
3244   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
3245   gcc_assert (!ok);
3246
3247   /* Try again with SLP forced off but if we didn't do any SLP there is
3248      no point in re-trying.  */
3249   if (!slp)
3250     return ok;
3251
3252   /* If the slp decision is true when suggested unroll factor is worked
3253      out, and we are applying suggested unroll factor, we don't need to
3254      re-try any more.  */
3255   if (applying_suggested_uf && slp_done_for_suggested_uf)
3256     return ok;
3257
3258   /* If there are reduction chains re-trying will fail anyway.  */
3259   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3260     return ok;
3261
3262   /* Likewise if the grouped loads or stores in the SLP cannot be handled
3263      via interleaving or lane instructions.  */
3264   slp_instance instance;
3265   slp_tree node;
3266   unsigned i, j;
3267   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3268     {
3269       stmt_vec_info vinfo;
3270       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3271       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3272         continue;
3273       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3274       unsigned int size = DR_GROUP_SIZE (vinfo);
3275       tree vectype = STMT_VINFO_VECTYPE (vinfo);
3276       if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3277          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3278          && ! vect_grouped_store_supported (vectype, size))
3279         return opt_result::failure_at (vinfo->stmt,
3280                                        "unsupported grouped store\n");
3281       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3282         {
3283           vinfo = SLP_TREE_REPRESENTATIVE (node);
3284           if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3285             {
3286               vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3287               bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3288               size = DR_GROUP_SIZE (vinfo);
3289               vectype = STMT_VINFO_VECTYPE (vinfo);
3290               if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3291                   && ! vect_grouped_load_supported (vectype, single_element_p,
3292                                                     size))
3293                 return opt_result::failure_at (vinfo->stmt,
3294                                                "unsupported grouped load\n");
3295             }
3296         }
3297     }
3298
3299   if (dump_enabled_p ())
3300     dump_printf_loc (MSG_NOTE, vect_location,
3301                      "re-trying with SLP disabled\n");
3302
3303   /* Roll back state appropriately.  No SLP this time.  */
3304   slp = false;
3305   /* Restore vectorization factor as it were without SLP.  */
3306   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3307   /* Free the SLP instances.  */
3308   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3309     vect_free_slp_instance (instance);
3310   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3311   /* Reset SLP type to loop_vect on all stmts.  */
3312   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3313     {
3314       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3315       for (gimple_stmt_iterator si = gsi_start_phis (bb);
3316            !gsi_end_p (si); gsi_next (&si))
3317         {
3318           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3319           STMT_SLP_TYPE (stmt_info) = loop_vect;
3320           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3321               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3322             {
3323               /* vectorizable_reduction adjusts reduction stmt def-types,
3324                  restore them to that of the PHI.  */
3325               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3326                 = STMT_VINFO_DEF_TYPE (stmt_info);
3327               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3328                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
3329                 = STMT_VINFO_DEF_TYPE (stmt_info);
3330             }
3331         }
3332       for (gimple_stmt_iterator si = gsi_start_bb (bb);
3333            !gsi_end_p (si); gsi_next (&si))
3334         {
3335           if (is_gimple_debug (gsi_stmt (si)))
3336             continue;
3337           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3338           STMT_SLP_TYPE (stmt_info) = loop_vect;
3339           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3340             {
3341               stmt_vec_info pattern_stmt_info
3342                 = STMT_VINFO_RELATED_STMT (stmt_info);
3343               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3344                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3345
3346               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3347               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3348               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3349                    !gsi_end_p (pi); gsi_next (&pi))
3350                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3351                   = loop_vect;
3352             }
3353         }
3354     }
3355   /* Free optimized alias test DDRS.  */
3356   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3357   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3358   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3359   /* Reset target cost data.  */
3360   delete loop_vinfo->vector_costs;
3361   loop_vinfo->vector_costs = nullptr;
3362   /* Reset accumulated rgroup information.  */
3363   LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3364   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3365   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3366   /* Reset assorted flags.  */
3367   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3368   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3369   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3370   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3371   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3372     = saved_can_use_partial_vectors_p;
3373
3374   goto start_over;
3375 }
3376
3377 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears to be a
3378    better choice than vectorizing it using OLD_LOOP_VINFO.  Both must be
3379    for the same loop.  OLD_LOOP_VINFO is taken to be better unless
3380    something specifically favors NEW_LOOP_VINFO.
3381
3382    Note that this deliberately isn't a partial order.  */
3383
3384 static bool
3385 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3386                           loop_vec_info old_loop_vinfo)
3387 {
3388   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3389   gcc_assert (loop == LOOP_VINFO_LOOP (old_loop_vinfo));
3390
3391   /* A VF of loop->simdlen is always preferred over any other VF, so if
3392      only one of the two has it, that settles the question.  */
3393   if (loop->simdlen)
3394     {
3395       poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3396       poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3397       bool new_matches_p = known_eq (new_vf, loop->simdlen);
3398       if (new_matches_p != known_eq (old_vf, loop->simdlen))
3399         return new_matches_p;
3400     }
3401
3402   /* Otherwise compare the costs, as an epilogue if there is a main loop.  */
3403   const auto *new_costs = new_loop_vinfo->vector_costs;
3404   const auto *old_costs = old_loop_vinfo->vector_costs;
3405   loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo);
3406   return (main_loop
3407           ? new_costs->better_epilogue_loop_than_p (old_costs, main_loop)
3408           : new_costs->better_main_loop_than_p (old_costs));
3409 }
3410
3411 /* Return true if NEW_LOOP_VINFO should replace OLD_LOOP_VINFO, noting
3412    the preferred vector mode in the dump output when dumping is enabled.  */
3413
3414 static bool
3415 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3416                         loop_vec_info old_loop_vinfo)
3417 {
3418   bool replace_p = vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo);
3419
3420   /* Only a win for the new candidate is reported.  */
3421   if (replace_p && dump_enabled_p ())
3422     dump_printf_loc (MSG_NOTE, vect_location,
3423                      "***** Preferring vector mode %s to vector mode %s\n",
3424                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
3425                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
3426   return replace_p;
3427 }
3428
3429 /* Analyze LOOP using vector mode VECTOR_MODES[MODE_I], as an epilogue of
3430    MAIN_LOOP_VINFO if that is not NULL.  Set AUTODETECTED_VECTOR_MODE when
3431    the mode tried is VOIDmode and advance MODE_I to the next mode useful to
3432    analyze.  Return the loop_vinfo on success, wrapped null on failure.  */
3433
3434 static opt_loop_vec_info
3435 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3436                      const vect_loop_form_info *loop_form_info,
3437                      loop_vec_info main_loop_vinfo,
3438                      const vector_modes &vector_modes, unsigned &mode_i,
3439                      machine_mode &autodetected_vector_mode,
3440                      bool &fatal)
3441 {
3442   loop_vec_info loop_vinfo
3443     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3444   machine_mode vector_mode = vector_modes[mode_i];
3445   loop_vinfo->vector_mode = vector_mode;
3446
3447   /* Run the main analysis, which may suggest an unroll factor.  */
3448   unsigned int suggested_unroll_factor = 1;
3449   bool slp_done_for_suggested_uf = false;
3450   opt_result res
3451     = vect_analyze_loop_2 (loop_vinfo, fatal, &suggested_unroll_factor,
3452                            slp_done_for_suggested_uf);
3453   if (dump_enabled_p ())
3454     dump_printf_loc (MSG_NOTE, vect_location,
3455                      "***** Analysis %s with vector mode %s\n",
3456                      res ? "succeeded" : " failed",
3457                      GET_MODE_NAME (loop_vinfo->vector_mode));
3458
3459   /* If a successful main-loop analysis suggested unrolling, analyze again
3460      with that unroll factor and switch to the result if it succeeds too.  */
3461   if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3462     {
3463       if (dump_enabled_p ())
3464         dump_printf_loc (MSG_NOTE, vect_location,
3465                          "***** Re-trying analysis for unrolling"
3466                          " with unroll factor %d and slp %s.\n",
3467                          suggested_unroll_factor,
3468                          slp_done_for_suggested_uf ? "on" : "off");
3469       loop_vec_info unroll_vinfo
3470         = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3471       unroll_vinfo->vector_mode = vector_mode;
3472       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3473       opt_result unroll_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3474                                                    slp_done_for_suggested_uf);
3475       if (!unroll_res)
3476         delete unroll_vinfo;
3477       else
3478         {
3479           delete loop_vinfo;
3480           loop_vinfo = unroll_vinfo;
3481         }
3482     }
3483
3484   /* Remember which mode was picked when the mode tried was VOIDmode.  */
3485   if (vector_mode == VOIDmode)
3486     autodetected_vector_mode = loop_vinfo->vector_mode;
3487
3488   /* Find the next mode to try: start after MODE_I and step over modes
3489      that would result in the same analysis result.  */
3490   unsigned int next_i = mode_i + 1;
3491   for (; (next_i < vector_modes.length ()
3492           && vect_chooses_same_modes_p (loop_vinfo, vector_modes[next_i]));
3493        ++next_i)
3494     if (dump_enabled_p ())
3495       dump_printf_loc (MSG_NOTE, vect_location,
3496                        "***** The result for vector mode %s would"
3497                        " be the same\n",
3498                        GET_MODE_NAME (vector_modes[next_i]));
3499
3500   /* Likewise step over a mode mutually related to the autodetected one.  */
3501   if (next_i < vector_modes.length ()
3502       && VECTOR_MODE_P (autodetected_vector_mode)
3503       && (related_vector_mode (vector_modes[next_i],
3504                                GET_MODE_INNER (autodetected_vector_mode))
3505           == autodetected_vector_mode)
3506       && (related_vector_mode (autodetected_vector_mode,
3507                                GET_MODE_INNER (vector_modes[next_i]))
3508           == vector_modes[next_i]))
3509     {
3510       if (dump_enabled_p ())
3511         dump_printf_loc (MSG_NOTE, vect_location,
3512                          "***** Skipping vector mode %s, which would"
3513                          " repeat the analysis for %s\n",
3514                          GET_MODE_NAME (vector_modes[next_i]),
3515                          GET_MODE_NAME (autodetected_vector_mode));
3516       ++next_i;
3517     }
3518   mode_i = next_i;
3519
3520   if (res)
3521     return opt_loop_vec_info::success (loop_vinfo);
3522
3523   /* The analysis failed, so LOOP_VINFO is no longer needed.  */
3524   delete loop_vinfo;
3525   if (fatal)
3526     gcc_checking_assert (main_loop_vinfo == NULL);
3527   return opt_loop_vec_info::propagate_failure (res);
3528 }
3529
3530 /* Function vect_analyze_loop.
3531
3532    Apply a set of analyses on LOOP, and create a loop_vec_info struct
3533    for it.  The different analyses will record information in the
3534    loop_vec_info struct.  */
3535 opt_loop_vec_info
3536 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3537 {
3538   DUMP_VECT_SCOPE ("analyze_loop_nest");
3539
3540   if (loop_outer (loop)
3541       && loop_vec_info_for_loop (loop_outer (loop))
3542       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3543     return opt_loop_vec_info::failure_at (vect_location,
3544                                           "outer-loop already vectorized.\n");
3545
3546   if (!find_loop_nest (loop, &shared->loop_nest))
3547     return opt_loop_vec_info::failure_at
3548       (vect_location,
3549        "not vectorized: loop nest containing two or more consecutive inner"
3550        " loops cannot be vectorized\n");
3551
3552   /* Analyze the loop form.  */
3553   vect_loop_form_info loop_form_info;
3554   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3555   if (!res)
3556     {
3557       if (dump_enabled_p ())
3558         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3559                          "bad loop form.\n");
3560       return opt_loop_vec_info::propagate_failure (res);
3561     }
3562   if (!integer_onep (loop_form_info.assumptions))
3563     {
3564       /* We consider to vectorize this loop by versioning it under
3565          some assumptions.  In order to do this, we need to clear
3566          existing information computed by scev and niter analyzer.  */
3567       scev_reset_htab ();
3568       free_numbers_of_iterations_estimates (loop);
3569       /* Also set flag for this loop so that following scev and niter
3570          analysis are done under the assumptions.  */
3571       loop_constraint_set (loop, LOOP_C_FINITE);
3572     }
3573
3574   auto_vector_modes vector_modes;
3575   /* Autodetect first vector size we try.  */
3576   vector_modes.safe_push (VOIDmode);
3577   unsigned int autovec_flags
3578     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3579                                                     loop->simdlen != 0);
3580   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3581                              && !unlimited_cost_model (loop));
3582   machine_mode autodetected_vector_mode = VOIDmode;
3583   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3584   unsigned int mode_i = 0;
3585   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3586
3587   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3588      a mode has not been analyzed.  */
3589   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3590   for (unsigned i = 0; i < vector_modes.length (); ++i)
3591     cached_vf_per_mode.safe_push (0);
3592
3593   /* First determine the main loop vectorization mode, either the first
3594      one that works, starting with auto-detecting the vector mode and then
3595      following the targets order of preference, or the one with the
3596      lowest cost if pick_lowest_cost_p.  */
3597   while (1)
3598     {
3599       bool fatal;
3600       unsigned int last_mode_i = mode_i;
3601       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3602          failed.  */
3603       cached_vf_per_mode[last_mode_i] = -1;
3604       opt_loop_vec_info loop_vinfo
3605         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3606                                NULL, vector_modes, mode_i,
3607                                autodetected_vector_mode, fatal);
3608       if (fatal)
3609         break;
3610
3611       if (loop_vinfo)
3612         {
3613           /*  Analysis has been successful so update the VF value.  The
3614               VF should always be a multiple of unroll_factor and we want to
3615               capture the original VF here.  */
3616           cached_vf_per_mode[last_mode_i]
3617             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3618                          loop_vinfo->suggested_unroll_factor);
3619           /* Once we hit the desired simdlen for the first time,
3620              discard any previous attempts.  */
3621           if (simdlen
3622               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3623             {
3624               delete first_loop_vinfo;
3625               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3626               simdlen = 0;
3627             }
3628           else if (pick_lowest_cost_p
3629                    && first_loop_vinfo
3630                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3631             {
3632               /* Pick loop_vinfo over first_loop_vinfo.  */
3633               delete first_loop_vinfo;
3634               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3635             }
3636           if (first_loop_vinfo == NULL)
3637             first_loop_vinfo = loop_vinfo;
3638           else
3639             {
3640               delete loop_vinfo;
3641               loop_vinfo = opt_loop_vec_info::success (NULL);
3642             }
3643
3644           /* Commit to first_loop_vinfo if we have no reason to try
3645              alternatives.  */
3646           if (!simdlen && !pick_lowest_cost_p)
3647             break;
3648         }
3649       if (mode_i == vector_modes.length ()
3650           || autodetected_vector_mode == VOIDmode)
3651         break;
3652
3653       /* Try the next biggest vector size.  */
3654       if (dump_enabled_p ())
3655         dump_printf_loc (MSG_NOTE, vect_location,
3656                          "***** Re-trying analysis with vector mode %s\n",
3657                          GET_MODE_NAME (vector_modes[mode_i]));
3658     }
3659   if (!first_loop_vinfo)
3660     return opt_loop_vec_info::propagate_failure (res);
3661
3662   if (dump_enabled_p ())
3663     dump_printf_loc (MSG_NOTE, vect_location,
3664                      "***** Choosing vector mode %s\n",
3665                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3666
3667   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3668      enabled, SIMDUID is not set, it is the innermost loop and we have
3669      either already found the loop's SIMDLEN or there was no SIMDLEN to
3670      begin with.
3671      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3672   bool vect_epilogues = (!simdlen
3673                          && loop->inner == NULL
3674                          && param_vect_epilogues_nomask
3675                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3676                          && !loop->simduid);
3677   if (!vect_epilogues)
3678     return first_loop_vinfo;
3679
3680   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3681   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3682
3683   /* For epilogues start the analysis from the first mode.  The motivation
3684      behind starting from the beginning comes from cases where the VECTOR_MODES
3685      array may contain length-agnostic and length-specific modes.  Their
3686      ordering is not guaranteed, so we could end up picking a mode for the main
3687      loop that is after the epilogue's optimal mode.  */
3688   vector_modes[0] = autodetected_vector_mode;
3689   mode_i = 0;
3690
3691   bool supports_partial_vectors =
3692     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3693   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3694
3695   while (1)
3696     {
3697       /* If the target does not support partial vectors we can shorten the
3698          number of modes to analyze for the epilogue as we know we can't pick a
3699          mode that would lead to a VF at least as big as the
3700          FIRST_VINFO_VF.  */
3701       if (!supports_partial_vectors
3702           && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3703         {
3704           mode_i++;
3705           if (mode_i == vector_modes.length ())
3706             break;
3707           continue;
3708         }
3709
3710       if (dump_enabled_p ())
3711         dump_printf_loc (MSG_NOTE, vect_location,
3712                          "***** Re-trying epilogue analysis with vector "
3713                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3714
3715       bool fatal;
3716       opt_loop_vec_info loop_vinfo
3717         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3718                                first_loop_vinfo,
3719                                vector_modes, mode_i,
3720                                autodetected_vector_mode, fatal);
3721       if (fatal)
3722         break;
3723
3724       if (loop_vinfo)
3725         {
3726           if (pick_lowest_cost_p)
3727             {
3728               /* Keep trying to roll back vectorization attempts while the
3729                  loop_vec_infos they produced were worse than this one.  */
3730               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3731               while (!vinfos.is_empty ()
3732                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3733                 {
3734                   gcc_assert (vect_epilogues);
3735                   delete vinfos.pop ();
3736                 }
3737             }
3738           /* For now only allow one epilogue loop.  */
3739           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3740             {
3741               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3742               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3743               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3744                           || maybe_ne (lowest_th, 0U));
3745               /* Keep track of the known smallest versioning
3746                  threshold.  */
3747               if (ordered_p (lowest_th, th))
3748                 lowest_th = ordered_min (lowest_th, th);
3749             }
3750           else
3751             {
3752               delete loop_vinfo;
3753               loop_vinfo = opt_loop_vec_info::success (NULL);
3754             }
3755
3756           /* For now only allow one epilogue loop, but allow
3757              pick_lowest_cost_p to replace it, so commit to the
3758              first epilogue if we have no reason to try alternatives.  */
3759           if (!pick_lowest_cost_p)
3760             break;
3761         }
3762
3763       if (mode_i == vector_modes.length ())
3764         break;
3765
3766     }
3767
3768   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3769     {
3770       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3771       if (dump_enabled_p ())
3772         dump_printf_loc (MSG_NOTE, vect_location,
3773                          "***** Choosing epilogue vector mode %s\n",
3774                          GET_MODE_NAME
3775                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3776     }
3777
3778   return first_loop_vinfo;
3779 }
3780
3781 /* Return true if there is an in-order reduction function for CODE, storing
3782    it in *REDUC_FN if so.  */
3783
3784 static bool
3785 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3786 {
3787   /* We support MINUS_EXPR by negating the operand.  This also preserves an
3788      initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3789      (-0.0) = -0.0.  */
3790   if (code == PLUS_EXPR || code == MINUS_EXPR)
3791     {
3792       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3793       return true;
3794     }
3795   return false;
3796 }
3797
3798 /* Function reduction_fn_for_scalar_code
3799
3800    Input:
3801    CODE - tree_code of a reduction operations.
3802
3803    Output:
3804    REDUC_FN - the corresponding internal function to be used to reduce the
3805       vector of partial results into a single scalar result, or IFN_LAST
3806       if the operation is a supported reduction operation, but does not have
3807       such an internal function.
3808
3809    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3810
3811 bool
3812 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3813 {
3814   if (code.is_tree_code ())
3815     switch (tree_code (code))
3816       {
3817       case MAX_EXPR:
3818         *reduc_fn = IFN_REDUC_MAX;
3819         return true;
3820
3821       case MIN_EXPR:
3822         *reduc_fn = IFN_REDUC_MIN;
3823         return true;
3824
3825       case PLUS_EXPR:
3826         *reduc_fn = IFN_REDUC_PLUS;
3827         return true;
3828
3829       case BIT_AND_EXPR:
3830         *reduc_fn = IFN_REDUC_AND;
3831         return true;
3832
3833       case BIT_IOR_EXPR:
3834         *reduc_fn = IFN_REDUC_IOR;
3835         return true;
3836
3837       case BIT_XOR_EXPR:
3838         *reduc_fn = IFN_REDUC_XOR;
3839         return true;
3840
3841       case MULT_EXPR:
3842       case MINUS_EXPR:
3843         *reduc_fn = IFN_LAST;
3844         return true;
3845
3846       default:
3847         return false;
3848       }
3849   else
3850     switch (combined_fn (code))
3851       {
3852       CASE_CFN_FMAX:
3853         *reduc_fn = IFN_REDUC_FMAX;
3854         return true;
3855
3856       CASE_CFN_FMIN:
3857         *reduc_fn = IFN_REDUC_FMIN;
3858         return true;
3859
3860       default:
3861         return false;
3862       }
3863 }
3864
3865 /* If there is a neutral value X such that a reduction would not be affected
3866    by the introduction of additional X elements, return that X, otherwise
3867    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3868    of the scalar elements.  If the reduction has just a single initial value
3869    then INITIAL_VALUE is that value, otherwise it is null.
3870    If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3871    In that case no signed zero is returned.  */
3872
3873 tree
3874 neutral_op_for_reduction (tree scalar_type, code_helper code,
3875                           tree initial_value, bool as_initial)
3876 {
3877   if (code.is_tree_code ())
3878     switch (tree_code (code))
3879       {
3880       case DOT_PROD_EXPR:
3881       case SAD_EXPR:
3882       case MINUS_EXPR:
3883       case BIT_IOR_EXPR:
3884       case BIT_XOR_EXPR:
3885         return build_zero_cst (scalar_type);
3886       case WIDEN_SUM_EXPR:
3887       case PLUS_EXPR:
3888         if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3889           return build_real (scalar_type, dconstm0);
3890         else
3891           return build_zero_cst (scalar_type);
3892
3893       case MULT_EXPR:
3894         return build_one_cst (scalar_type);
3895
3896       case BIT_AND_EXPR:
3897         return build_all_ones_cst (scalar_type);
3898
3899       case MAX_EXPR:
3900       case MIN_EXPR:
3901         return initial_value;
3902
3903       default:
3904         return NULL_TREE;
3905       }
3906   else
3907     switch (combined_fn (code))
3908       {
3909       CASE_CFN_FMIN:
3910       CASE_CFN_FMAX:
3911         return initial_value;
3912
3913       default:
3914         return NULL_TREE;
3915       }
3916 }
3917
3918 /* Dump helper for vect_is_simple_reduction below.  Print MSG followed by
3919    the GIMPLE statement STMT at vect_location, using dump kind MSG_TYPE.
        The helper does not test dump_enabled_p itself.  */
3920
3921 static void
3922 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3923 {
3924   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3925 }
3926
3927 /* Return true if we need an in-order reduction for operation CODE
3928    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3929    overflow must wrap.  */
3930
3931 bool
3932 needs_fold_left_reduction_p (tree type, code_helper code)
3933 {
3934   /* CHECKME: check for !flag_finite_math_only too?  */
3935   if (SCALAR_FLOAT_TYPE_P (type))
3936     {
3937       if (code.is_tree_code ())
3938         switch (tree_code (code))
3939           {
3940           case MIN_EXPR:
3941           case MAX_EXPR:
3942             return false;
3943
3944           default:
3945             return !flag_associative_math;
3946           }
3947       else
3948         switch (combined_fn (code))
3949           {
3950           CASE_CFN_FMIN:
3951           CASE_CFN_FMAX:
3952             return false;
3953
3954           default:
3955             return !flag_associative_math;
3956           }
3957     }
3958
3959   if (INTEGRAL_TYPE_P (type))
3960     return (!code.is_tree_code ()
3961             || !operation_no_trapping_overflow (type, tree_code (code)));
3962
3963   if (SAT_FIXED_POINT_TYPE_P (type))
3964     return true;
3965
3966   return false;
3967 }
3968
3969 /* Return true if the reduction PHI in LOOP, whose latch argument is
3970    LOOP_ARG, has a handled computation expression.  Store the main
3971    reduction operation in *CODE.

        The walk starts at PHI's use of LOOP_ARG and follows SSA use operands
        of defining statements inside LOOP until it reaches the result of PHI.
        PATH accumulates the (operand iterator, use operand) pairs of the
        walk; the first pair pushed is PHI's use of LOOP_ARG.  LOC is the
        location used for the dump output.  True is only returned when the
        path is valid, does not negate the reduction value and a reduction
        operation was found.  */
3972
3973 static bool
3974 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3975                       tree loop_arg, code_helper *code,
3976                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3977 {
3978   auto_bitmap visited;
3979   tree lookfor = PHI_RESULT (phi);
3980   ssa_op_iter curri;
3981   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
       /* Position CURR on the PHI argument that is LOOP_ARG.  */
3982   while (USE_FROM_PTR (curr) != loop_arg)
3983     curr = op_iter_next_use (&curri);
       /* NOTE(review): presumably this marks the iterator as exhausted so that
          backtracking to this entry does not move on to the other PHI
          arguments -- confirm against ssa_op_iter.  */
3984   curri.i = curri.numops;
3985   do
3986     {
3987       path.safe_push (std::make_pair (curri, curr));
3988       tree use = USE_FROM_PTR (curr);
           /* Reaching the PHI result closes the cycle; the search is done.  */
3989       if (use == lookfor)
3990         break;
3991       gimple *def = SSA_NAME_DEF_STMT (use);
3992       if (gimple_nop_p (def)
3993           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3994         {
               /* Backtrack: drop path entries until one of them has a further
                  operand that is an SSA name not visited before, or until the
                  path is empty.  */
3995 pop:
3996           do
3997             {
3998               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3999               curri = x.first;
4000               curr = x.second;
4001               do
4002                 curr = op_iter_next_use (&curri);
4003               /* Skip already visited or non-SSA operands (from iterating
4004                  over PHI args).  */
4005               while (curr != NULL_USE_OPERAND_P
4006                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4007                          || ! bitmap_set_bit (visited,
4008                                               SSA_NAME_VERSION
4009                                                 (USE_FROM_PTR (curr)))));
4010             }
4011           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
               /* Nothing left to explore: give up with an empty path.  */
4012           if (curr == NULL_USE_OPERAND_P)
4013             break;
4014         }
4015       else
4016         {
               /* Descend into DEF: continue with its first operand that is an
                  SSA name not visited before, backtracking if there is none.  */
4017           if (gimple_code (def) == GIMPLE_PHI)
4018             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4019           else
4020             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4021           while (curr != NULL_USE_OPERAND_P
4022                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4023                      || ! bitmap_set_bit (visited,
4024                                           SSA_NAME_VERSION
4025                                             (USE_FROM_PTR (curr)))))
4026             curr = op_iter_next_use (&curri);
4027           if (curr == NULL_USE_OPERAND_P)
4028             goto pop;
4029         }
4030     }
4031   while (1);
4032   if (dump_file && (dump_flags & TDF_DETAILS))
4033     {
4034       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4035       unsigned i;
4036       std::pair<ssa_op_iter, use_operand_p> *x;
4037       FOR_EACH_VEC_ELT (path, i, x)
4038         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4039       dump_printf (MSG_NOTE, "\n");
4040     }
4041
4042   /* Check whether the reduction path detected is valid.  */
4043   bool fail = path.length () == 0;
       /* NEG tracks whether the reduction value ends up negated; SIGN records
          the signedness of the type of the statement that determined *CODE.  */
4044   bool neg = false;
4045   int sign = -1;
4046   *code = ERROR_MARK;
       /* The first path entry is the use on PHI itself, so the statements to
          validate start at index 1.  */
4047   for (unsigned i = 1; i < path.length (); ++i)
4048     {
4049       gimple *use_stmt = USE_STMT (path[i].second);
4050       gimple_match_op op;
4051       if (!gimple_extract_op (use_stmt, &op))
4052         {
4053           fail = true;
4054           break;
4055         }
           /* OPI becomes the index in OP of the operand the path goes through;
              the value op.num_ops means it was not located.  */
4056       unsigned int opi = op.num_ops;
4057       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4058         {
4059           /* The following make sure we can compute the operand index
4060              easily plus it mostly disallows chaining via COND_EXPR condition
4061              operands.  */
4062           for (opi = 0; opi < op.num_ops; ++opi)
4063             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4064               break;
4065         }
4066       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4067         {
4068           for (opi = 0; opi < op.num_ops; ++opi)
4069             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4070               break;
4071         }
4072       if (opi == op.num_ops)
4073         {
4074           fail = true;
4075           break;
4076         }
4077       op.code = canonicalize_code (op.code, op.type);
           /* A subtraction is treated as an addition for the purpose of
              determining the reduction code.  */
4078       if (op.code == MINUS_EXPR)
4079         {
4080           op.code = PLUS_EXPR;
4081           /* Track whether we negate the reduction value each iteration.  */
4082           if (op.ops[1] == op.ops[opi])
4083             neg = ! neg;
4084         }
           /* Nop conversions are ignored when determining *CODE.  The first
              other statement sets *CODE; every later one has to agree with it,
              and for MIN_EXPR/MAX_EXPR also with the recorded signedness.  */
4085       if (CONVERT_EXPR_CODE_P (op.code)
4086           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4087         ;
4088       else if (*code == ERROR_MARK)
4089         {
4090           *code = op.code;
4091           sign = TYPE_SIGN (op.type);
4092         }
4093       else if (op.code != *code)
4094         {
4095           fail = true;
4096           break;
4097         }
4098       else if ((op.code == MIN_EXPR
4099                 || op.code == MAX_EXPR)
4100                && sign != TYPE_SIGN (op.type))
4101         {
4102           fail = true;
4103           break;
4104         }
4105       /* Check there's only a single stmt the op is used on.  For the
4106          not value-changing tail and the last stmt allow out-of-loop uses.
4107          ???  We could relax this and handle arbitrary live stmts by
4108          forcing a scalar epilogue for example.  */
4109       imm_use_iterator imm_iter;
4110       use_operand_p use_p;
4111       gimple *op_use_stmt;
4112       unsigned cnt = 0;
           /* True if the operation is an internal function for which
              conditional_internal_fn_code yields a code.  */
4113       bool cond_fn_p = op.code.is_internal_fn ()
4114         && (conditional_internal_fn_code (internal_fn (op.code))
4115             != ERROR_MARK);
4116
4117       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4118         {
4119         /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4120            op1 twice (once as definition, once as else) in the same operation.
4121            Allow this.  */
4122           if (cond_fn_p && op_use_stmt == use_stmt)
4123             {
4124               gcall *call = as_a<gcall *> (use_stmt);
4125               unsigned else_pos
4126                 = internal_fn_else_index (internal_fn (op.code));
4127
                   /* Count the occurrences of the operand among the call
                      arguments, leaving out the else position.  */
4128               for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4129                 {
4130                   if (j == else_pos)
4131                     continue;
4132                   if (gimple_call_arg (call, j) == op.ops[opi])
4133                     cnt++;
4134                 }
4135             }
               /* Debug stmts never count.  Uses outside of LOOP are only
                  ignored while *CODE has not been determined yet.  */
4136           else if (!is_gimple_debug (op_use_stmt)
4137                    && (*code != ERROR_MARK
4138                        || flow_bb_inside_loop_p (loop,
4139                                                  gimple_bb (op_use_stmt))))
4140             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4141               cnt++;
4142         }
4143
4144       if (cnt != 1)
4145         {
4146           fail = true;
4147           break;
4148         }
4149     }
       /* Reject paths that failed validation, that leave the reduction value
          negated, or on which no reduction operation was found.  */
4150   return ! fail && ! neg && *code != ERROR_MARK;
4151 }
4152
4153 bool
4154 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4155                       tree loop_arg, enum tree_code code)
4156 {
4157   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4158   code_helper code_;
4159   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4160           && code_ == code);
4161 }
4162
4163
4164
4165 /* Function vect_is_simple_reduction
4166
4167    (1) Detect a cross-iteration def-use cycle that represents a simple
4168    reduction computation.  We look for the following pattern:
4169
4170    loop_header:
4171      a1 = phi < a0, a2 >
4172      a3 = ...
4173      a2 = operation (a3, a1)
4174
4175    or
4176
4177    a3 = ...
4178    loop_header:
4179      a1 = phi < a0, a2 >
4180      a2 = operation (a3, a1)
4181
4182    such that:
4183    1. operation is commutative and associative and it is safe to
4184       change the order of the computation
4185    2. no uses for a2 in the loop (a2 is used out of the loop)
4186    3. no uses of a1 in the loop besides the reduction operation
4187    4. no uses of a1 outside the loop.
4188
4189    Conditions 1,4 are tested here.
4190    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4191
4192    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4193    nested cycles.
4194
4195    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4196    reductions:
4197
4198      a1 = phi < a0, a2 >
4199      inner loop (def of a3)
4200      a2 = phi < a3 >
4201
4202    (4) Detect condition expressions, ie:
4203      for (int i = 0; i < N; i++)
4204        if (a[i] < val)
4205         ret_val = a[i];
4206

        PHI_INFO describes the PHI to examine; LOOP_INFO is the loop being
        analyzed.  On success the stmt_vec_info of the statement defining the
        latch argument of the PHI is returned, otherwise NULL.

        *DOUBLE_REDUC is set to true when a double reduction is detected and
        *REDUC_CHAIN_P when a reduction chain was recorded in LOOP_INFO; both
        are false otherwise.  Reduction chains are only recorded when SLP is
        true.

        STMT_VINFO_REDUC_TYPE of PHI_INFO is reset to TREE_CODE_REDUCTION on
        entry.  When a reduction path is found, STMT_VINFO_REDUC_CODE of
        PHI_INFO and STMT_VINFO_REDUC_IDX of the statements on the path are
        filled in as well.
4207 */
4208
4209 static stmt_vec_info
4210 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4211                           bool *double_reduc, bool *reduc_chain_p, bool slp)
4212 {
4213   gphi *phi = as_a <gphi *> (phi_info->stmt);
4214   gimple *phi_use_stmt = NULL;
4215   imm_use_iterator imm_iter;
4216   use_operand_p use_p;
4217
4218   *double_reduc = false;
4219   *reduc_chain_p = false;
4220   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4221
4222   tree phi_name = PHI_RESULT (phi);
4223   /* ???  If there are no uses of the PHI result the inner loop reduction
4224      won't be detected as possibly double-reduction by vectorizable_reduction
4225      because that tries to walk the PHI arg from the preheader edge which
4226      can be constant.  See PR60382.  */
4227   if (has_zero_uses (phi_name))
4228     return NULL;
4229   class loop *loop = (gimple_bb (phi))->loop_father;
       /* Count the non-debug statements using the PHI result, rejecting any
          use outside of LOOP.  */
4230   unsigned nphi_def_loop_uses = 0;
4231   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4232     {
4233       gimple *use_stmt = USE_STMT (use_p);
4234       if (is_gimple_debug (use_stmt))
4235         continue;
4236
4237       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4238         {
4239           if (dump_enabled_p ())
4240             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4241                              "intermediate value used outside loop.\n");
4242
4243           return NULL;
4244         }
4245
4246       /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4247          op1 twice (once as definition, once as else) in the same operation.
4248          Only count it as one. */
4249       if (use_stmt != phi_use_stmt)
4250         {
4251           nphi_def_loop_uses++;
4252           phi_use_stmt = use_stmt;
4253         }
4254     }
4255
       /* The value flowing around the back edge has to be an SSA name with
          a definition inside LOOP that LOOP_INFO knows about.  */
4256   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4257   if (TREE_CODE (latch_def) != SSA_NAME)
4258     {
4259       if (dump_enabled_p ())
4260         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4261                          "reduction: not ssa_name: %T\n", latch_def);
4262       return NULL;
4263     }
4264
4265   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4266   if (!def_stmt_info
4267       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4268     return NULL;
4269
4270   bool nested_in_vect_loop
4271     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
       /* Count the uses of the latch definition inside LOOP and collect the
          PHIs using it outside of LOOP.  */
4272   unsigned nlatch_def_loop_uses = 0;
4273   auto_vec<gphi *, 3> lcphis;
4274   bool inner_loop_of_double_reduc = false;
4275   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4276     {
4277       gimple *use_stmt = USE_STMT (use_p);
4278       if (is_gimple_debug (use_stmt))
4279         continue;
4280       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4281         nlatch_def_loop_uses++;
4282       else
4283         {
4284           /* We can have more than one loop-closed PHI.  */
4285           lcphis.safe_push (as_a <gphi *> (use_stmt));
4286           if (nested_in_vect_loop
4287               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4288                   == vect_double_reduction_def))
4289             inner_loop_of_double_reduc = true;
4290         }
4291     }
4292
4293   /* If we are vectorizing an inner reduction we are executing that
4294      in the original order only in case we are not dealing with a
4295      double reduction.  */
4296   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4297     {
4298       if (dump_enabled_p ())
4299         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4300                         "detected nested cycle: ");
4301       return def_stmt_info;
4302     }
4303
4304   /* When the inner loop of a double reduction ends up with more than
4305      one loop-closed PHI we have failed to classify alternate such
4306      PHIs as double reduction, leading to wrong code.  See PR103237.  */
4307   if (inner_loop_of_double_reduc && lcphis.length () != 1)
4308     {
4309       if (dump_enabled_p ())
4310         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4311                          "unhandle double reduction\n");
4312       return NULL;
4313     }
4314
4315   /* If this isn't a nested cycle or if the nested cycle reduction value
4316      is used outside of the inner loop we cannot handle uses of the reduction
4317      value.  */
4318   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4319     {
4320       if (dump_enabled_p ())
4321         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4322                          "reduction used in loop.\n");
4323       return NULL;
4324     }
4325
4326   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4327      defined in the inner loop.  */
4328   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4329     {
4330       tree op1 = PHI_ARG_DEF (def_stmt, 0);
4331       if (gimple_phi_num_args (def_stmt) != 1
4332           || TREE_CODE (op1) != SSA_NAME)
4333         {
4334           if (dump_enabled_p ())
4335             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4336                              "unsupported phi node definition.\n");
4337
4338           return NULL;
4339         }
4340
4341       /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4342          and the latch definition op1.  */
4343       gimple *def1 = SSA_NAME_DEF_STMT (op1);
4344       if (gimple_bb (def1)
4345           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4346           && loop->inner
4347           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4348           && (is_gimple_assign (def1) || is_gimple_call (def1))
4349           && is_a <gphi *> (phi_use_stmt)
4350           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4351           && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4352                                             loop_latch_edge (loop->inner))))
4353         {
4354           if (dump_enabled_p ())
4355             report_vect_op (MSG_NOTE, def_stmt,
4356                             "detected double reduction: ");
4357
4358           *double_reduc = true;
4359           return def_stmt_info;
4360         }
4361
4362       return NULL;
4363     }
4364
4365   /* Look for the expression computing latch_def from the loop PHI result.  */
4366   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4367   code_helper code;
4368   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4369                             path))
4370     {
4371       STMT_VINFO_REDUC_CODE (phi_info) = code;
4372       if (code == COND_EXPR && !nested_in_vect_loop)
4373         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4374
4375       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4376          reduction chain for which the additional restriction is that
4377          all operations in the chain are the same.  */
4378       auto_vec<stmt_vec_info, 8> reduc_chain;
4379       unsigned i;
4380       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
           /* Entry 0 of the path is the use on PHI itself.  The remaining
              entries are walked from the statement using the PHI result
              towards the statement defining latch_def.  */
4381       for (i = path.length () - 1; i >= 1; --i)
4382         {
4383           gimple *stmt = USE_STMT (path[i].second);
4384           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4385           gimple_match_op op;
4386           if (!gimple_extract_op (stmt, &op))
4387             gcc_unreachable ();
               /* Record which operand of STMT the reduction goes through.  */
4388           if (gassign *assign = dyn_cast<gassign *> (stmt))
4389             STMT_VINFO_REDUC_IDX (stmt_info)
4390               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4391           else
4392             {
4393               gcall *call = as_a<gcall *> (stmt);
4394               STMT_VINFO_REDUC_IDX (stmt_info)
4395                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4396             }
               /* Despite the name this covers a conversion at either end of
                  the path.  */
4397           bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4398                                      && (i == 1 || i == path.length () - 1));
4399           if ((op.code != code && !leading_conversion)
4400               /* We can only handle the final value in epilogue
4401                  generation for reduction chains.  */
4402               || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4403             is_slp_reduc = false;
4404           /* For reduction chains we support a trailing/leading
4405              conversions.  We do not store those in the actual chain.  */
4406           if (leading_conversion)
4407             continue;
4408           reduc_chain.safe_push (stmt_info);
4409         }
4410       if (slp && is_slp_reduc && reduc_chain.length () > 1)
4411         {
               /* Link the chain elements: every element points to the first
                  one and to its successor; the last has no successor.  */
4412           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4413             {
4414               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4415               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4416             }
4417           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4418           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4419
4420           /* Save the chain for further analysis in SLP detection.  */
4421           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4422           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4423
4424           *reduc_chain_p = true;
4425           if (dump_enabled_p ())
4426             dump_printf_loc (MSG_NOTE, vect_location,
4427                             "reduction: detected reduction chain\n");
4428         }
4429       else if (dump_enabled_p ())
4430         dump_printf_loc (MSG_NOTE, vect_location,
4431                          "reduction: detected reduction\n");
4432
4433       return def_stmt_info;
4434     }
4435
4436   if (dump_enabled_p ())
4437     dump_printf_loc (MSG_NOTE, vect_location,
4438                      "reduction: unknown pattern\n");
4439
4440   return NULL;
4441 }
4442
4443 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4444    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4445    or -1 if not known.  */
4446
4447 static int
4448 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4449 {
4450   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4451   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4452     {
4453       if (dump_enabled_p ())
4454         dump_printf_loc (MSG_NOTE, vect_location,
4455                          "cost model: epilogue peel iters set to vf/2 "
4456                          "because loop iterations are unknown .\n");
4457       return assumed_vf / 2;
4458     }
4459   else
4460     {
4461       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4462       peel_iters_prologue = MIN (niters, peel_iters_prologue);
4463       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4464       /* If we need to peel for gaps, but no peeling is required, we have to
4465          peel VF iterations.  */
4466       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4467         peel_iters_epilogue = assumed_vf;
4468       return peel_iters_epilogue;
4469     }
4470 }
4471
4472 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
4473 int
4474 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4475                              int *peel_iters_epilogue,
4476                              stmt_vector_for_cost *scalar_cost_vec,
4477                              stmt_vector_for_cost *prologue_cost_vec,
4478                              stmt_vector_for_cost *epilogue_cost_vec)
4479 {
4480   int retval = 0;
4481
4482   *peel_iters_epilogue
4483     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4484
4485   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4486     {
4487       /* If peeled iterations are known but number of scalar loop
4488          iterations are unknown, count a taken branch per peeled loop.  */
4489       if (peel_iters_prologue > 0)
4490         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4491                                    vect_prologue);
4492       if (*peel_iters_epilogue > 0)
4493         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4494                                     vect_epilogue);
4495     }
4496
4497   stmt_info_for_cost *si;
4498   int j;
4499   if (peel_iters_prologue)
4500     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4501       retval += record_stmt_cost (prologue_cost_vec,
4502                                   si->count * peel_iters_prologue,
4503                                   si->kind, si->stmt_info, si->misalign,
4504                                   vect_prologue);
4505   if (*peel_iters_epilogue)
4506     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4507       retval += record_stmt_cost (epilogue_cost_vec,
4508                                   si->count * *peel_iters_epilogue,
4509                                   si->kind, si->stmt_info, si->misalign,
4510                                   vect_epilogue);
4511
4512   return retval;
4513 }
4514
4515 /* Function vect_estimate_min_profitable_iters
4516
4517    Compute the number of iterations required for the vector version of the
4518    loop described by LOOP_VINFO to be profitable relative to the cost of
4519    the scalar version of the loop.  The results are stored through the
        pointer arguments described below.
4520
4521    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4522    of iterations for vectorization.  -1 value means loop vectorization
4523    is not profitable.  This returned value may be used for dynamic
4524    profitability check.
4525
4526    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4527    for static check against estimated number of iterations.  It is never
        smaller than *RET_MIN_PROFITABLE_NITERS and is likewise set to -1
        when vectorization is not profitable.

        Both outputs are set to 0 when the cost model is disabled for the
        loop.

        SUGGESTED_UNROLL_FACTOR is passed on to finish_cost.  If it is nonnull
        and the value found there afterwards is greater than 1, it is reset
        to 1 when LOOP_VINFO has a maximum vectorization factor other than
        MAX_VECTORIZATION_FACTOR and the vectorization factor multiplied by
        *SUGGESTED_UNROLL_FACTOR is not known to stay within that maximum.  */
4528
4529 static void
4530 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4531                                     int *ret_min_profitable_niters,
4532                                     int *ret_min_profitable_estimate,
4533                                     unsigned *suggested_unroll_factor)
4534 {
4535   int min_profitable_iters;
4536   int min_profitable_estimate;
4537   int peel_iters_prologue;
4538   int peel_iters_epilogue;
4539   unsigned vec_inside_cost = 0;
4540   int vec_outside_cost = 0;
4541   unsigned vec_prologue_cost = 0;
4542   unsigned vec_epilogue_cost = 0;
4543   int scalar_single_iter_cost = 0;
4544   int scalar_outside_cost = 0;
       /* Vectorization factor to assume for costing purposes.  */
4545   int assumed_vf = vect_vf_for_cost (loop_vinfo);
       /* Iterations peeled for alignment.  A negative value is treated below
          as "not known at compile time".  */
4546   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4547   vector_costs *target_cost_data = loop_vinfo->vector_costs;
4548
4549   /* Cost model disabled.  */
4550   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4551     {
4552       if (dump_enabled_p ())
4553         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4554       *ret_min_profitable_niters = 0;
4555       *ret_min_profitable_estimate = 0;
4556       return;
4557     }
4558
4559   /* Requires loop versioning tests to handle misalignment.  */
4560   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4561     {
4562       /*  FIXME: Make cost depend on complexity of individual check.  */
           /* For now, count one scalar statement per possibly misaligned
              statement.  */
4563       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4564       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4565       if (dump_enabled_p ())
4566         dump_printf (MSG_NOTE,
4567                      "cost model: Adding cost of checks for loop "
4568                      "versioning to treat misalignment.\n");
4569     }
4570
4571   /* Requires loop versioning with alias checks.  */
4572   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4573     {
4574       /*  FIXME: Make cost depend on complexity of individual check.  */
           /* For now, count one scalar statement per recorded alias DDR.  */
4575       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4576       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4577       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4578       if (len)
4579         /* Count LEN - 1 ANDs and LEN comparisons.  */
4580         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4581                               scalar_stmt, vect_prologue);
4582       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4583       if (len)
4584         {
4585           /* Count LEN - 1 ANDs and LEN comparisons.  */
4586           unsigned int nstmts = len * 2 - 1;
4587           /* +1 for each bias that needs adding.  */
4588           for (unsigned int i = 0; i < len; ++i)
4589             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4590               nstmts += 1;
4591           (void) add_stmt_cost (target_cost_data, nstmts,
4592                                 scalar_stmt, vect_prologue);
4593         }
4594       if (dump_enabled_p ())
4595         dump_printf (MSG_NOTE,
4596                      "cost model: Adding cost of checks for loop "
4597                      "versioning aliasing.\n");
4598     }
4599
4600   /* Requires loop versioning with niter checks.  */
4601   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4602     {
4603       /*  FIXME: Make cost depend on complexity of individual check.  */
4604       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4605                             NULL, NULL, NULL_TREE, 0, vect_prologue);
4606       if (dump_enabled_p ())
4607         dump_printf (MSG_NOTE,
4608                      "cost model: Adding cost of checks for loop "
4609                      "versioning niters.\n");
4610     }
4611
       /* Any kind of versioning also costs one taken branch in the
          prologue.  */
4612   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4613     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4614                           vect_prologue);
4615
4616   /* Count statements in scalar loop.  Using this as scalar cost for a single
4617      iteration for now.
4618
4619      TODO: Add outer loop support.
4620
4621      TODO: Consider assigning different costs to different scalar
4622      statements.  */
4623
4624   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4625
4626   /* Add additional cost for the peeled instructions in prologue and epilogue
4627      loop.  (For fully-masked loops there will be no peeling.)
4628
4629      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4630      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4631
4632      TODO: Build an expression that represents peel_iters for prologue and
4633      epilogue to be used in a run-time test.  */
4634
4635   bool prologue_need_br_taken_cost = false;
4636   bool prologue_need_br_not_taken_cost = false;
4637
4638   /* Calculate peel_iters_prologue.  */
4639   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4640     peel_iters_prologue = 0;
4641   else if (npeel < 0)
4642     {
4643       peel_iters_prologue = assumed_vf / 2;
4644       if (dump_enabled_p ())
4645         dump_printf (MSG_NOTE, "cost model: "
4646                      "prologue peel iters set to vf/2.\n");
4647
4648       /* If peeled iterations are unknown, count a taken branch and a not taken
4649          branch per peeled loop.  Even if scalar loop iterations are known,
4650          vector iterations are not known since peeled prologue iterations are
4651          not known.  Hence guards remain the same.  */
4652       prologue_need_br_taken_cost = true;
4653       prologue_need_br_not_taken_cost = true;
4654     }
4655   else
4656     {
4657       peel_iters_prologue = npeel;
4658       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4659         /* If peeled iterations are known but number of scalar loop
4660            iterations are unknown, count a taken branch per peeled loop.  */
4661         prologue_need_br_taken_cost = true;
4662     }
4663
4664   bool epilogue_need_br_taken_cost = false;
4665   bool epilogue_need_br_not_taken_cost = false;
4666
4667   /* Calculate peel_iters_epilogue.  */
4668   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4669     /* We need to peel exactly one iteration for gaps.  */
4670     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4671   else if (npeel < 0)
4672     {
4673       /* If peeling for alignment is unknown, loop bound of main loop
4674          becomes unknown.  */
4675       peel_iters_epilogue = assumed_vf / 2;
4676       if (dump_enabled_p ())
4677         dump_printf (MSG_NOTE, "cost model: "
4678                      "epilogue peel iters set to vf/2 because "
4679                      "peeling for alignment is unknown.\n");
4680
4681       /* See the same reason above in peel_iters_prologue calculation.  */
4682       epilogue_need_br_taken_cost = true;
4683       epilogue_need_br_not_taken_cost = true;
4684     }
4685   else
4686     {
4687       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4688       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4689         /* If peeled iterations are known but number of scalar loop
4690            iterations are unknown, count a taken branch per peeled loop.  */
4691         epilogue_need_br_taken_cost = true;
4692     }
4693
4694   stmt_info_for_cost *si;
4695   int j;
4696   /* Add costs associated with peel_iters_prologue.  */
       /* Each recorded scalar-iteration cost entry is charged once per
          peeled iteration.  */
4697   if (peel_iters_prologue)
4698     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4699       {
4700         (void) add_stmt_cost (target_cost_data,
4701                               si->count * peel_iters_prologue, si->kind,
4702                               si->stmt_info, si->node, si->vectype,
4703                               si->misalign, vect_prologue);
4704       }
4705
4706   /* Add costs associated with peel_iters_epilogue.  */
4707   if (peel_iters_epilogue)
4708     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4709       {
4710         (void) add_stmt_cost (target_cost_data,
4711                               si->count * peel_iters_epilogue, si->kind,
4712                               si->stmt_info, si->node, si->vectype,
4713                               si->misalign, vect_epilogue);
4714       }
4715
4716   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4717
4718   if (prologue_need_br_taken_cost)
4719     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4720                           vect_prologue);
4721
4722   if (prologue_need_br_not_taken_cost)
4723     (void) add_stmt_cost (target_cost_data, 1,
4724                           cond_branch_not_taken, vect_prologue);
4725
4726   if (epilogue_need_br_taken_cost)
4727     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4728                           vect_epilogue);
4729
4730   if (epilogue_need_br_not_taken_cost)
4731     (void) add_stmt_cost (target_cost_data, 1,
4732                           cond_branch_not_taken, vect_epilogue);
4733
4734   /* Take care of special costs for rgroup controls of partial vectors.  */
4735   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4736       && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4737           == vect_partial_vectors_avx512))
4738     {
4739       /* Calculate how many masks we need to generate.  */
4740       unsigned int num_masks = 0;
4741       bool need_saturation = false;
4742       for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4743         if (rgm.type)
4744           {
4745             unsigned nvectors = rgm.factor;
4746             num_masks += nvectors;
                 /* Saturation is needed as soon as one group compares in a
                    type narrower than the rgroup IV type.  */
4747             if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4748                 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4749               need_saturation = true;
4750           }
4751
4752       /* ???  The target isn't able to identify the costs below as
4753          producing masks so it cannot penalize cases where we'd run
4754          out of mask registers for example.  */
4755
4756       /* ???  We are also failing to account for smaller vector masks
4757          we generate by splitting larger masks in vect_get_loop_mask.  */
4758
4759       /* In the worst case, we need to generate each mask in the prologue
4760          and in the loop body.  We need one splat per group and one
4761          compare per mask.
4762
4763          Sometimes the prologue mask will fold to a constant,
4764          so the actual prologue cost might be smaller.  However, it's
4765          simpler and safer to use the worst-case cost; if this ends up
4766          being the tie-breaker between vectorizing or not, then it's
4767          probably better not to vectorize.  */
4768       (void) add_stmt_cost (target_cost_data,
4769                             num_masks
4770                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4771                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4772                             vect_prologue);
4773       (void) add_stmt_cost (target_cost_data,
4774                             num_masks
4775                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4776                             vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4777
4778       /* When we need saturation we need it both in the prologue and
4779          the loop body.  */
4780       if (need_saturation)
4781         {
4782           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4783                                 NULL, NULL, NULL_TREE, 0, vect_prologue);
4784           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4785                                 NULL, NULL, NULL_TREE, 0, vect_body);
4786         }
4787     }
4788   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4789            && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4790                == vect_partial_vectors_while_ult))
4791     {
4792       /* Calculate how many masks we need to generate.  */
4793       unsigned int num_masks = 0;
4794       rgroup_controls *rgm;
4795       unsigned int num_vectors_m1;
4796       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4797                         num_vectors_m1, rgm)
4798         if (rgm->type)
4799           num_masks += num_vectors_m1 + 1;
           /* NUM_MASKS - 1 is costed below, so there must be at least one
              mask.  */
4800       gcc_assert (num_masks > 0);
4801
4802       /* In the worst case, we need to generate each mask in the prologue
4803          and in the loop body.  One of the loop body mask instructions
4804          replaces the comparison in the scalar loop, and since we don't
4805          count the scalar comparison against the scalar body, we shouldn't
4806          count that vector instruction against the vector body either.
4807
4808          Sometimes we can use unpacks instead of generating prologue
4809          masks and sometimes the prologue mask will fold to a constant,
4810          so the actual prologue cost might be smaller.  However, it's
4811          simpler and safer to use the worst-case cost; if this ends up
4812          being the tie-breaker between vectorizing or not, then it's
4813          probably better not to vectorize.  */
4814       (void) add_stmt_cost (target_cost_data, num_masks,
4815                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4816                             vect_prologue);
4817       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4818                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4819                             vect_body);
4820     }
4821   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4822     {
4823       /* Referring to the functions vect_set_loop_condition_partial_vectors
4824          and vect_set_loop_controls_directly, we need to generate each
4825          length in the prologue and in the loop body if required. Although
4826          there are some possible optimizations, we consider the worst case
4827          here.  */
4828
4829       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4830       signed char partial_load_store_bias
4831         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
           /* The lengths only need updating in the body when this is not an
              epilogue loop and the iteration count is not known to be
              smaller than the vectorization factor.  */
4832       bool need_iterate_p
4833         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4834            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4835
4836       /* Calculate how many statements to be added.  */
4837       unsigned int prologue_stmts = 0;
4838       unsigned int body_stmts = 0;
4839
4840       rgroup_controls *rgc;
4841       unsigned int num_vectors_m1;
4842       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4843         if (rgc->type)
4844           {
4845             /* May need one SHIFT for nitems_total computation.  */
4846             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4847             if (nitems != 1 && !niters_known_p)
4848               prologue_stmts += 1;
4849
4850             /* May need one MAX and one MINUS for wrap around.  */
4851             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4852               prologue_stmts += 2;
4853
4854             /* Need one MAX and one MINUS for each batch limit excepting for
4855                the 1st one.  */
4856             prologue_stmts += num_vectors_m1 * 2;
4857
4858             unsigned int num_vectors = num_vectors_m1 + 1;
4859
4860             /* Need to set up lengths in prologue, only one MIN required
4861                for each since start index is zero.  */
4862             prologue_stmts += num_vectors;
4863
4864             /* If we have a non-zero partial load bias, we need one PLUS
4865                to adjust the load length.  */
4866             if (partial_load_store_bias != 0)
4867               body_stmts += 1;
4868
4869             /* Each may need two MINs and one MINUS to update lengths in body
4870                for next iteration.  */
4871             if (need_iterate_p)
4872               body_stmts += 3 * num_vectors;
4873           }
4874
4875       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4876                             scalar_stmt, vect_prologue);
4877       (void) add_stmt_cost (target_cost_data, body_stmts,
4878                             scalar_stmt, vect_body);
4879     }
4880
4881   /* FORNOW: The scalar outside cost is incremented in one of the
4882      following ways:
4883
4884      1. The vectorizer checks for alignment and aliasing and generates
4885      a condition that allows dynamic vectorization.  A cost model
4886      check is ANDED with the versioning condition.  Hence scalar code
4887      path now has the added cost of the versioning check.
4888
4889        if (cost > th & versioning_check)
4890          jmp to vector code
4891
4892      Hence run-time scalar is incremented by not-taken branch cost.
4893
4894      2. The vectorizer then checks if a prologue is required.  If the
4895      cost model check was not done before during versioning, it has to
4896      be done before the prologue check.
4897
4898        if (cost <= th)
4899          prologue = scalar_iters
4900        if (prologue == 0)
4901          jmp to vector code
4902        else
4903          execute prologue
4904        if (prologue == num_iters)
4905          go to exit
4906
4907      Hence the run-time scalar cost is incremented by a taken branch,
4908      plus a not-taken branch, plus a taken branch cost.
4909
4910      3. The vectorizer then checks if an epilogue is required.  If the
4911      cost model check was not done before during prologue check, it
4912      has to be done with the epilogue check.
4913
4914        if (prologue == 0)
4915          jmp to vector code
4916        else
4917          execute prologue
4918        if (prologue == num_iters)
4919          go to exit
4920        vector code:
4921          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4922            jmp to epilogue
4923
4924      Hence the run-time scalar cost should be incremented by 2 taken
4925      branches.
4926
4927      TODO: The back end may reorder the BBs differently and reverse
4928      conditions/branch directions.  Change the estimates below to
4929      something more reasonable.  */
4930
4931   /* If the number of iterations is known and we do not do versioning, we can
4932      decide whether to vectorize at compile time.  Hence the scalar version
4933      does not carry cost model guard costs.  */
4934   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4935       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4936     {
4937       /* Cost model check occurs at versioning.  */
4938       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4939         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4940       else
4941         {
4942           /* Cost model check occurs at prologue generation.  */
4943           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4944             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4945               + vect_get_stmt_cost (cond_branch_not_taken);
4946           /* Cost model check occurs at epilogue generation.  */
4947           else
4948             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4949         }
4950     }
4951
4952   /* Complete the target-specific cost calculations.  */
       /* The prologue, body and epilogue totals come back through the three
          cost pointers passed here.  */
4953   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4954                &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4955                suggested_unroll_factor);
4956
       /* Drop an unroll suggestion that is not known to keep the unrolled
          vectorization factor within a restricted maximum.  */
4957   if (suggested_unroll_factor && *suggested_unroll_factor > 1
4958       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4959       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4960                     *suggested_unroll_factor,
4961                     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4962     {
4963       if (dump_enabled_p ())
4964         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4965                          "can't unroll as unrolled vectorization factor larger"
4966                          " than maximum vectorization factor: "
4967                          HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4968                          LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4969       *suggested_unroll_factor = 1;
4970     }
4971
       /* Everything outside the vector loop body: prologue plus epilogue.  */
4972   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4973
4974   if (dump_enabled_p ())
4975     {
4976       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4977       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4978                    vec_inside_cost);
4979       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4980                    vec_prologue_cost);
4981       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4982                    vec_epilogue_cost);
4983       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4984                    scalar_single_iter_cost);
4985       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4986                    scalar_outside_cost);
4987       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4988                    vec_outside_cost);
4989       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4990                    peel_iters_prologue);
4991       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4992                    peel_iters_epilogue);
4993     }
4994
4995   /* Calculate number of iterations required to make the vector version
4996      profitable, relative to the loop bodies only.  The following condition
4997      must hold true:
4998      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4999      where
5000      SIC = scalar iteration cost, VIC = vector iteration cost,
5001      VOC = vector outside cost, VF = vectorization factor,
5002      NPEEL = prologue iterations + epilogue iterations,
5003      SOC = scalar outside cost for run time cost model check.  */
5004
       /* Cost saved by one vector iteration compared with the ASSUMED_VF
          scalar iterations it stands for.  */
5005   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5006                           - vec_inside_cost);
       /* Without a positive saving per vector iteration the condition above
          cannot be met by increasing NITERS, so report "not profitable".  */
5007   if (saving_per_viter <= 0)
5008     {
5009       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5010         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5011                     "vectorization did not happen for a simd loop");
5012
5013       if (dump_enabled_p ())
5014         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5015                          "cost model: the vector iteration cost = %d "
5016                          "divided by the scalar iteration cost = %d "
5017                          "is greater or equal to the vectorization factor = %d"
5018                          ".\n",
5019                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5020       *ret_min_profitable_niters = -1;
5021       *ret_min_profitable_estimate = -1;
5022       return;
5023     }
5024
5025   /* ??? The "if" arm is written to handle all cases; see below for what
5026      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
5027   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5028     {
5029       /* Rewriting the condition above in terms of the number of
5030          vector iterations (vniters) rather than the number of
5031          scalar iterations (niters) gives:
5032
5033          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5034
5035          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5036
5037          For integer N, X and Y when X > 0:
5038
5039          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
5040       int outside_overhead = (vec_outside_cost
5041                               - scalar_single_iter_cost * peel_iters_prologue
5042                               - scalar_single_iter_cost * peel_iters_epilogue
5043                               - scalar_outside_cost);
5044       /* We're only interested in cases that require at least one
5045          vector iteration.  */
5046       int min_vec_niters = 1;
5047       if (outside_overhead > 0)
5048         min_vec_niters = outside_overhead / saving_per_viter + 1;
5049
5050       if (dump_enabled_p ())
5051         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
5052                      min_vec_niters);
5053
           /* This test repeats the enclosing condition, so as the code
              stands the "else" arm below is not reached; see the ??? note
              before the outer "if".  */
5054       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5055         {
5056           /* Now that we know the minimum number of vector iterations,
5057              find the minimum niters for which the scalar cost is larger:
5058
5059              SIC * niters > VIC * vniters + VOC - SOC
5060
5061              We know that the minimum niters is no more than
5062              vniters * VF + NPEEL, but it might be (and often is) less
5063              than that if a partial vector iteration is cheaper than the
5064              equivalent scalar code.  */
5065           int threshold = (vec_inside_cost * min_vec_niters
5066                            + vec_outside_cost
5067                            - scalar_outside_cost);
5068           if (threshold <= 0)
5069             min_profitable_iters = 1;
5070           else
5071             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5072         }
5073       else
5074         /* Convert the number of vector iterations into a number of
5075            scalar iterations.  */
5076         min_profitable_iters = (min_vec_niters * assumed_vf
5077                                 + peel_iters_prologue
5078                                 + peel_iters_epilogue);
5079     }
5080   else
5081     {
           /* Multiplying the condition above by VF and collecting the NITERS
              terms gives:

              niters * (SIC * VF - VIC) > (VOC - SOC) * VF - VIC * NPEEL

              Compute the right-hand side first; if it is not positive the
              threshold is simply zero.  */
5082       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5083                               * assumed_vf
5084                               - vec_inside_cost * peel_iters_prologue
5085                               - vec_inside_cost * peel_iters_epilogue);
5086       if (min_profitable_iters <= 0)
5087         min_profitable_iters = 0;
5088       else
5089         {
5090           min_profitable_iters /= saving_per_viter;
5091
               /* The division above truncates.  Add one more iteration if,
                  at the truncated count, SIC * VF * niters is still not
                  strictly greater than VIC * niters + (VOC - SOC) * VF
                  (note that this comparison has no NPEEL term).  */
5092           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5093               <= (((int) vec_inside_cost * min_profitable_iters)
5094                   + (((int) vec_outside_cost - scalar_outside_cost)
5095                      * assumed_vf)))
5096             min_profitable_iters++;
5097         }
5098     }
5099
5100   if (dump_enabled_p ())
5101     dump_printf (MSG_NOTE,
5102                  "  Calculated minimum iters for profitability: %d\n",
5103                  min_profitable_iters);
5104
5105   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5106       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5107     /* We want the vectorized loop to execute at least once.  */
5108     min_profitable_iters = assumed_vf + peel_iters_prologue;
5109   else if (min_profitable_iters < peel_iters_prologue)
5110     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5111        vectorized loop executes at least once.  */
5112     min_profitable_iters = peel_iters_prologue;
5113
5114   if (dump_enabled_p ())
5115     dump_printf_loc (MSG_NOTE, vect_location,
5116                      "  Runtime profitability threshold = %d\n",
5117                      min_profitable_iters);
5118
5119   *ret_min_profitable_niters = min_profitable_iters;
5120
5121   /* Calculate number of iterations required to make the vector version
5122      profitable, relative to the loop bodies only.
5123
5124      The non-vectorized variant costs SIC * niters and the vector variant
5125      must beat it on the expected loop trip count.  The following condition
5126      must hold true:
          SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
5127
       /* With no vector outside cost the static threshold starts at zero;
          it is still raised to the runtime threshold further down.  */
5128   if (vec_outside_cost <= 0)
5129     min_profitable_estimate = 0;
5130   /* ??? This "else if" arm is written to handle all cases; see below for
5131      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
5132   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5133     {
5134       /* This is a repeat of the code above, but with + SOC rather
5135          than - SOC.  */
5136       int outside_overhead = (vec_outside_cost
5137                               - scalar_single_iter_cost * peel_iters_prologue
5138                               - scalar_single_iter_cost * peel_iters_epilogue
5139                               + scalar_outside_cost);
5140       int min_vec_niters = 1;
5141       if (outside_overhead > 0)
5142         min_vec_niters = outside_overhead / saving_per_viter + 1;
5143
           /* As above, this test repeats the enclosing condition, so the
              "else" arm below is not reached as the code stands.  */
5144       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5145         {
5146           int threshold = (vec_inside_cost * min_vec_niters
5147                            + vec_outside_cost
5148                            + scalar_outside_cost);
5149           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5150         }
5151       else
5152         min_profitable_estimate = (min_vec_niters * assumed_vf
5153                                    + peel_iters_prologue
5154                                    + peel_iters_epilogue);
5155     }
5156   else
5157     {
           /* Same rearrangement as for the runtime threshold, with + SOC
              instead of - SOC and a plain truncating division.  */
5158       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5159                                  * assumed_vf
5160                                  - vec_inside_cost * peel_iters_prologue
5161                                  - vec_inside_cost * peel_iters_epilogue)
5162                                  / ((scalar_single_iter_cost * assumed_vf)
5163                                    - vec_inside_cost);
5164     }
       /* The static threshold is never allowed to be below the runtime
          threshold.  */
5165   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5166   if (dump_enabled_p ())
5167     dump_printf_loc (MSG_NOTE, vect_location,
5168                      "  Static estimate profitability threshold = %d\n",
5169                      min_profitable_estimate);
5170
5171   *ret_min_profitable_estimate = min_profitable_estimate;
5172 }
5173
5174 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5175    vector elements (not bits) for a vector with NELT elements.  */
5176 static void
5177 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5178                               vec_perm_builder *sel)
5179 {
5180   /* The encoding is a single stepped pattern.  Any wrap-around is handled
5181      by vec_perm_indices.  */
5182   sel->new_vector (nelt, 1, 3);
5183   for (unsigned int i = 0; i < 3; i++)
5184     sel->quick_push (i + offset);
5185 }
5186
5187 /* Checks whether the target supports whole-vector shifts for vectors of mode
5188    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
5189    it supports vec_perm_const with masks for all necessary shift amounts.  */
5190 static bool
5191 have_whole_vector_shift (machine_mode mode)
5192 {
5193   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5194     return true;
5195
5196   /* Variable-length vectors should be handled via the optab.  */
5197   unsigned int nelt;
5198   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5199     return false;
5200
5201   vec_perm_builder sel;
5202   vec_perm_indices indices;
5203   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5204     {
5205       calc_vec_perm_mask_for_shift (i, nelt, &sel);
5206       indices.new_vector (sel, 2, nelt);
5207       if (!can_vec_perm_const_p (mode, mode, indices, false))
5208         return false;
5209     }
5210   return true;
5211 }
5212
5213 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5214    multiplication operands have differing signs and (b) we intend
5215    to emulate the operation using a series of signed DOT_PROD_EXPRs.
5216    See vect_emulate_mixed_dot_prod for the actual sequence used.  */
5217
5218 static bool
5219 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5220                                  stmt_vec_info stmt_info)
5221 {
5222   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5223   if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5224     return false;
5225
5226   tree rhs1 = gimple_assign_rhs1 (assign);
5227   tree rhs2 = gimple_assign_rhs2 (assign);
5228   if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5229     return false;
5230
5231   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5232   gcc_assert (reduc_info->is_reduc_info);
5233   return !directly_supported_p (DOT_PROD_EXPR,
5234                                 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5235                                 optab_vector_mixed_sign);
5236 }
5237
5238 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5239    functions. Design better to avoid maintenance issues.  */
5240
5241 /* Function vect_model_reduction_cost.
5242
5243    Models cost for a reduction operation, including the vector ops
5244    generated within the strip-mine loop in some cases, the initial
5245    definition before the loop, and the epilogue code that must be generated.  */
5246
5247 static void
5248 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5249                            stmt_vec_info stmt_info, internal_fn reduc_fn,
5250                            vect_reduction_type reduction_type,
5251                            int ncopies, stmt_vector_for_cost *cost_vec)
5252 {
5253   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5254   tree vectype;
5255   machine_mode mode;
5256   class loop *loop = NULL;
5257
5258   if (loop_vinfo)
5259     loop = LOOP_VINFO_LOOP (loop_vinfo);
5260
5261   /* Condition reductions generate two reductions in the loop.  */
5262   if (reduction_type == COND_REDUCTION)
5263     ncopies *= 2;
5264
5265   vectype = STMT_VINFO_VECTYPE (stmt_info);
5266   mode = TYPE_MODE (vectype);
5267   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5268
5269   gimple_match_op op;
5270   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5271     gcc_unreachable ();
5272
5273   bool emulated_mixed_dot_prod
5274     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5275   if (reduction_type == EXTRACT_LAST_REDUCTION)
5276     /* No extra instructions are needed in the prologue.  The loop body
5277        operations are costed in vectorizable_condition.  */
5278     inside_cost = 0;
5279   else if (reduction_type == FOLD_LEFT_REDUCTION)
5280     {
5281       /* No extra instructions needed in the prologue.  */
5282       prologue_cost = 0;
5283
5284       if (reduc_fn != IFN_LAST)
5285         /* Count one reduction-like operation per vector.  */
5286         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5287                                         stmt_info, 0, vect_body);
5288       else
5289         {
5290           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
5291           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5292           inside_cost = record_stmt_cost (cost_vec, nelements,
5293                                           vec_to_scalar, stmt_info, 0,
5294                                           vect_body);
5295           inside_cost += record_stmt_cost (cost_vec, nelements,
5296                                            scalar_stmt, stmt_info, 0,
5297                                            vect_body);
5298         }
5299     }
5300   else
5301     {
5302       /* Add in the cost of the initial definitions.  */
5303       int prologue_stmts;
5304       if (reduction_type == COND_REDUCTION)
5305         /* For cond reductions we have four vectors: initial index, step,
5306            initial result of the data reduction, initial value of the index
5307            reduction.  */
5308         prologue_stmts = 4;
5309       else if (emulated_mixed_dot_prod)
5310         /* We need the initial reduction value and two invariants:
5311            one that contains the minimum signed value and one that
5312            contains half of its negative.  */
5313         prologue_stmts = 3;
5314       else
5315         prologue_stmts = 1;
5316       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5317                                          scalar_to_vec, stmt_info, 0,
5318                                          vect_prologue);
5319     }
5320
5321   /* Determine cost of epilogue code.
5322
5323      We have a reduction operator that will reduce the vector in one statement.
5324      Also requires scalar extract.  */
5325
5326   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5327     {
5328       if (reduc_fn != IFN_LAST)
5329         {
5330           if (reduction_type == COND_REDUCTION)
5331             {
5332               /* An EQ stmt and an COND_EXPR stmt.  */
5333               epilogue_cost += record_stmt_cost (cost_vec, 2,
5334                                                  vector_stmt, stmt_info, 0,
5335                                                  vect_epilogue);
5336               /* Reduction of the max index and a reduction of the found
5337                  values.  */
5338               epilogue_cost += record_stmt_cost (cost_vec, 2,
5339                                                  vec_to_scalar, stmt_info, 0,
5340                                                  vect_epilogue);
5341               /* A broadcast of the max value.  */
5342               epilogue_cost += record_stmt_cost (cost_vec, 1,
5343                                                  scalar_to_vec, stmt_info, 0,
5344                                                  vect_epilogue);
5345             }
5346           else
5347             {
5348               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5349                                                  stmt_info, 0, vect_epilogue);
5350               epilogue_cost += record_stmt_cost (cost_vec, 1,
5351                                                  vec_to_scalar, stmt_info, 0,
5352                                                  vect_epilogue);
5353             }
5354         }
5355       else if (reduction_type == COND_REDUCTION)
5356         {
5357           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5358           /* Extraction of scalar elements.  */
5359           epilogue_cost += record_stmt_cost (cost_vec,
5360                                              2 * estimated_nunits,
5361                                              vec_to_scalar, stmt_info, 0,
5362                                              vect_epilogue);
5363           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
5364           epilogue_cost += record_stmt_cost (cost_vec,
5365                                              2 * estimated_nunits - 3,
5366                                              scalar_stmt, stmt_info, 0,
5367                                              vect_epilogue);
5368         }
5369       else if (reduction_type == EXTRACT_LAST_REDUCTION
5370                || reduction_type == FOLD_LEFT_REDUCTION)
5371         /* No extra instructions need in the epilogue.  */
5372         ;
5373       else
5374         {
5375           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5376           tree bitsize = TYPE_SIZE (op.type);
5377           int element_bitsize = tree_to_uhwi (bitsize);
5378           int nelements = vec_size_in_bits / element_bitsize;
5379
5380           if (op.code == COND_EXPR)
5381             op.code = MAX_EXPR;
5382
5383           /* We have a whole vector shift available.  */
5384           if (VECTOR_MODE_P (mode)
5385               && directly_supported_p (op.code, vectype)
5386               && have_whole_vector_shift (mode))
5387             {
5388               /* Final reduction via vector shifts and the reduction operator.
5389                  Also requires scalar extract.  */
5390               epilogue_cost += record_stmt_cost (cost_vec,
5391                                                  exact_log2 (nelements) * 2,
5392                                                  vector_stmt, stmt_info, 0,
5393                                                  vect_epilogue);
5394               epilogue_cost += record_stmt_cost (cost_vec, 1,
5395                                                  vec_to_scalar, stmt_info, 0,
5396                                                  vect_epilogue);
5397             }
5398           else
5399             /* Use extracts and reduction op for final reduction.  For N
5400                elements, we have N extracts and N-1 reduction ops.  */
5401             epilogue_cost += record_stmt_cost (cost_vec,
5402                                                nelements + nelements - 1,
5403                                                vector_stmt, stmt_info, 0,
5404                                                vect_epilogue);
5405         }
5406     }
5407
5408   if (dump_enabled_p ())
5409     dump_printf (MSG_NOTE,
5410                  "vect_model_reduction_cost: inside_cost = %d, "
5411                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5412                  prologue_cost, epilogue_cost);
5413 }
5414
5415 /* SEQ is a sequence of instructions that initialize the reduction
5416    described by REDUC_INFO.  Emit them in the appropriate place.  */
5417
5418 static void
5419 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5420                                 stmt_vec_info reduc_info, gimple *seq)
5421 {
5422   if (reduc_info->reused_accumulator)
5423     {
5424       /* When reusing an accumulator from the main loop, we only need
5425          initialization instructions if the main loop can be skipped.
5426          In that case, emit the initialization instructions at the end
5427          of the guard block that does the skip.  */
5428       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5429       gcc_assert (skip_edge);
5430       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5431       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5432     }
5433   else
5434     {
5435       /* The normal case: emit the initialization instructions on the
5436          preheader edge.  */
5437       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5438       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5439     }
5440 }
5441
5442 /* Function get_initial_def_for_reduction
5443
5444    Input:
5445    REDUC_INFO - the info_for_reduction
5446    INIT_VAL - the initial value of the reduction variable
5447    NEUTRAL_OP - a value that has no effect on the reduction, as per
5448                 neutral_op_for_reduction
5449
5450    Output:
5451    Return a vector variable, initialized according to the operation that
5452         STMT_VINFO performs. This vector will be used as the initial value
5453         of the vector of partial results.
5454
5455    The value we need is a vector in which element 0 has value INIT_VAL
5456    and every other element has value NEUTRAL_OP.  */
5457
5458 static tree
5459 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5460                                stmt_vec_info reduc_info,
5461                                tree init_val, tree neutral_op)
5462 {
5463   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5464   tree scalar_type = TREE_TYPE (init_val);
5465   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5466   tree init_def;
5467   gimple_seq stmts = NULL;
5468
5469   gcc_assert (vectype);
5470
5471   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5472               || SCALAR_FLOAT_TYPE_P (scalar_type));
5473
5474   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5475               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5476
5477   if (operand_equal_p (init_val, neutral_op))
5478     {
5479       /* If both elements are equal then the vector described above is
5480          just a splat.  */
5481       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5482       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5483     }
5484   else
5485     {
5486       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5487       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5488       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5489         {
5490           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5491              element 0.  */
5492           init_def = gimple_build_vector_from_val (&stmts, vectype,
5493                                                    neutral_op);
5494           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5495                                    vectype, init_def, init_val);
5496         }
5497       else
5498         {
5499           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
5500           tree_vector_builder elts (vectype, 1, 2);
5501           elts.quick_push (init_val);
5502           elts.quick_push (neutral_op);
5503           init_def = gimple_build_vector (&stmts, &elts);
5504         }
5505     }
5506
5507   if (stmts)
5508     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5509   return init_def;
5510 }
5511
5512 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5513    which performs a reduction involving GROUP_SIZE scalar statements.
5514    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
5515    is nonnull, introducing extra elements of that value will not change the
5516    result.  */
5517
5518 static void
5519 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5520                                 stmt_vec_info reduc_info,
5521                                 vec<tree> *vec_oprnds,
5522                                 unsigned int number_of_vectors,
5523                                 unsigned int group_size, tree neutral_op)
5524 {
5525   vec<tree> &initial_values = reduc_info->reduc_initial_values;
5526   unsigned HOST_WIDE_INT nunits;
5527   unsigned j, number_of_places_left_in_vector;
5528   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5529   unsigned int i;
5530
5531   gcc_assert (group_size == initial_values.length () || neutral_op);
5532
5533   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5534      created vectors. It is greater than 1 if unrolling is performed.
5535
5536      For example, we have two scalar operands, s1 and s2 (e.g., group of
5537      strided accesses of size two), while NUNITS is four (i.e., four scalars
5538      of this type can be packed in a vector).  The output vector will contain
5539      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
5540      will be 2).
5541
5542      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5543      vectors containing the operands.
5544
5545      For example, NUNITS is four as before, and the group size is 8
5546      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
5547      {s5, s6, s7, s8}.  */
5548
5549   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5550     nunits = group_size;
5551
5552   number_of_places_left_in_vector = nunits;
5553   bool constant_p = true;
5554   tree_vector_builder elts (vector_type, nunits, 1);
5555   elts.quick_grow (nunits);
5556   gimple_seq ctor_seq = NULL;
5557   for (j = 0; j < nunits * number_of_vectors; ++j)
5558     {
5559       tree op;
5560       i = j % group_size;
5561
5562       /* Get the def before the loop.  In reduction chain we have only
5563          one initial value.  Else we have as many as PHIs in the group.  */
5564       if (i >= initial_values.length () || (j > i && neutral_op))
5565         op = neutral_op;
5566       else
5567         op = initial_values[i];
5568
5569       /* Create 'vect_ = {op0,op1,...,opn}'.  */
5570       number_of_places_left_in_vector--;
5571       elts[nunits - number_of_places_left_in_vector - 1] = op;
5572       if (!CONSTANT_CLASS_P (op))
5573         constant_p = false;
5574
5575       if (number_of_places_left_in_vector == 0)
5576         {
5577           tree init;
5578           if (constant_p && !neutral_op
5579               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5580               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5581             /* Build the vector directly from ELTS.  */
5582             init = gimple_build_vector (&ctor_seq, &elts);
5583           else if (neutral_op)
5584             {
5585               /* Build a vector of the neutral value and shift the
5586                  other elements into place.  */
5587               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5588                                                    neutral_op);
5589               int k = nunits;
5590               while (k > 0 && elts[k - 1] == neutral_op)
5591                 k -= 1;
5592               while (k > 0)
5593                 {
5594                   k -= 1;
5595                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5596                                        vector_type, init, elts[k]);
5597                 }
5598             }
5599           else
5600             {
5601               /* First time round, duplicate ELTS to fill the
5602                  required number of vectors.  */
5603               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5604                                         elts, number_of_vectors, *vec_oprnds);
5605               break;
5606             }
5607           vec_oprnds->quick_push (init);
5608
5609           number_of_places_left_in_vector = nunits;
5610           elts.new_vector (vector_type, nunits, 1);
5611           elts.quick_grow (nunits);
5612           constant_p = true;
5613         }
5614     }
5615   if (ctor_seq != NULL)
5616     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5617 }
5618
5619 /* For a statement STMT_INFO taking part in a reduction operation return
5620    the stmt_vec_info the meta information is stored on.  */
5621
5622 stmt_vec_info
5623 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5624 {
5625   stmt_info = vect_orig_stmt (stmt_info);
5626   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5627   if (!is_a <gphi *> (stmt_info->stmt)
5628       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5629     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5630   gphi *phi = as_a <gphi *> (stmt_info->stmt);
5631   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5632     {
5633       if (gimple_phi_num_args (phi) == 1)
5634         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5635     }
5636   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5637     {
5638       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5639       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5640         stmt_info = info;
5641     }
5642   return stmt_info;
5643 }
5644
5645 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5646    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
5647    return false.  */
5648
5649 static bool
5650 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5651                                 stmt_vec_info reduc_info)
5652 {
5653   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5654   if (!main_loop_vinfo)
5655     return false;
5656
5657   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5658     return false;
5659
5660   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5661   auto_vec<tree, 16> main_loop_results (num_phis);
5662   auto_vec<tree, 16> initial_values (num_phis);
5663   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5664     {
5665       /* The epilogue loop can be entered either from the main loop or
5666          from an earlier guard block.  */
5667       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5668       for (tree incoming_value : reduc_info->reduc_initial_values)
5669         {
5670           /* Look for:
5671
5672                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5673                                     INITIAL_VALUE(guard block)>.  */
5674           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5675
5676           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5677           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5678
5679           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5680           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5681
5682           main_loop_results.quick_push (from_main_loop);
5683           initial_values.quick_push (from_skip);
5684         }
5685     }
5686   else
5687     /* The main loop dominates the epilogue loop.  */
5688     main_loop_results.splice (reduc_info->reduc_initial_values);
5689
5690   /* See if the main loop has the kind of accumulator we need.  */
5691   vect_reusable_accumulator *accumulator
5692     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5693   if (!accumulator
5694       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5695       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5696                       accumulator->reduc_info->reduc_scalar_results.begin ()))
5697     return false;
5698
5699   /* Handle the case where we can reduce wider vectors to narrower ones.  */
5700   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5701   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5702   unsigned HOST_WIDE_INT m;
5703   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5704                             TYPE_VECTOR_SUBPARTS (vectype), &m))
5705     return false;
5706   /* Check the intermediate vector types and operations are available.  */
5707   tree prev_vectype = old_vectype;
5708   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5709   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5710     {
5711       intermediate_nunits = exact_div (intermediate_nunits, 2);
5712       tree intermediate_vectype = get_related_vectype_for_scalar_type
5713         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5714       if (!intermediate_vectype
5715           || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5716                                     intermediate_vectype)
5717           || !can_vec_extract (TYPE_MODE (prev_vectype),
5718                                TYPE_MODE (intermediate_vectype)))
5719         return false;
5720       prev_vectype = intermediate_vectype;
5721     }
5722
5723   /* Non-SLP reductions might apply an adjustment after the reduction
5724      operation, in order to simplify the initialization of the accumulator.
5725      If the epilogue loop carries on from where the main loop left off,
5726      it should apply the same adjustment to the final reduction result.
5727
5728      If the epilogue loop can also be entered directly (rather than via
5729      the main loop), we need to be able to handle that case in the same way,
5730      with the same adjustment.  (In principle we could add a PHI node
5731      to select the correct adjustment, but in practice that shouldn't be
5732      necessary.)  */
5733   tree main_adjustment
5734     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5735   if (loop_vinfo->main_loop_edge && main_adjustment)
5736     {
5737       gcc_assert (num_phis == 1);
5738       tree initial_value = initial_values[0];
5739       /* Check that we can use INITIAL_VALUE as the adjustment and
5740          initialize the accumulator with a neutral value instead.  */
5741       if (!operand_equal_p (initial_value, main_adjustment))
5742         return false;
5743       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5744       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5745                                                     code, initial_value);
5746     }
5747   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5748   reduc_info->reduc_initial_values.truncate (0);
5749   reduc_info->reduc_initial_values.splice (initial_values);
5750   reduc_info->reused_accumulator = accumulator;
5751   return true;
5752 }
5753
5754 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5755    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
5756
5757 static tree
5758 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5759                             gimple_seq *seq)
5760 {
5761   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5762   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5763   tree stype = TREE_TYPE (vectype);
5764   tree new_temp = vec_def;
5765   while (nunits > nunits1)
5766     {
5767       nunits /= 2;
5768       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5769                                                            stype, nunits);
5770       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5771
5772       /* The target has to make sure we support lowpart/highpart
5773          extraction, either via direct vector extract or through
5774          an integer mode punning.  */
5775       tree dst1, dst2;
5776       gimple *epilog_stmt;
5777       if (convert_optab_handler (vec_extract_optab,
5778                                  TYPE_MODE (TREE_TYPE (new_temp)),
5779                                  TYPE_MODE (vectype1))
5780           != CODE_FOR_nothing)
5781         {
5782           /* Extract sub-vectors directly once vec_extract becomes
5783              a conversion optab.  */
5784           dst1 = make_ssa_name (vectype1);
5785           epilog_stmt
5786               = gimple_build_assign (dst1, BIT_FIELD_REF,
5787                                      build3 (BIT_FIELD_REF, vectype1,
5788                                              new_temp, TYPE_SIZE (vectype1),
5789                                              bitsize_int (0)));
5790           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5791           dst2 =  make_ssa_name (vectype1);
5792           epilog_stmt
5793               = gimple_build_assign (dst2, BIT_FIELD_REF,
5794                                      build3 (BIT_FIELD_REF, vectype1,
5795                                              new_temp, TYPE_SIZE (vectype1),
5796                                              bitsize_int (bitsize)));
5797           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5798         }
5799       else
5800         {
5801           /* Extract via punning to appropriately sized integer mode
5802              vector.  */
5803           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5804           tree etype = build_vector_type (eltype, 2);
5805           gcc_assert (convert_optab_handler (vec_extract_optab,
5806                                              TYPE_MODE (etype),
5807                                              TYPE_MODE (eltype))
5808                       != CODE_FOR_nothing);
5809           tree tem = make_ssa_name (etype);
5810           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5811                                              build1 (VIEW_CONVERT_EXPR,
5812                                                      etype, new_temp));
5813           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5814           new_temp = tem;
5815           tem = make_ssa_name (eltype);
5816           epilog_stmt
5817               = gimple_build_assign (tem, BIT_FIELD_REF,
5818                                      build3 (BIT_FIELD_REF, eltype,
5819                                              new_temp, TYPE_SIZE (eltype),
5820                                              bitsize_int (0)));
5821           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5822           dst1 = make_ssa_name (vectype1);
5823           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5824                                              build1 (VIEW_CONVERT_EXPR,
5825                                                      vectype1, tem));
5826           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5827           tem = make_ssa_name (eltype);
5828           epilog_stmt
5829               = gimple_build_assign (tem, BIT_FIELD_REF,
5830                                      build3 (BIT_FIELD_REF, eltype,
5831                                              new_temp, TYPE_SIZE (eltype),
5832                                              bitsize_int (bitsize)));
5833           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5834           dst2 =  make_ssa_name (vectype1);
5835           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5836                                              build1 (VIEW_CONVERT_EXPR,
5837                                                      vectype1, tem));
5838           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5839         }
5840
5841       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5842     }
5843
5844   return new_temp;
5845 }
5846
5847 /* Function vect_create_epilog_for_reduction
5848
5849    Create code at the loop-epilog to finalize the result of a reduction
5850    computation.
5851
5852    STMT_INFO is the scalar reduction stmt that is being vectorized.
5853    SLP_NODE is an SLP node containing a group of reduction statements. The
5854      first one in this group is STMT_INFO.
5855    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5856    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5857      (counting from 0)
5858
5859    This function:
5860    1. Completes the reduction def-use cycles.
5861    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5862       by calling the function specified by REDUC_FN if available, or by
5863       other means (whole-vector shifts or a scalar loop).
5864       The function also creates a new phi node at the loop exit to preserve
5865       loop-closed form, as illustrated below.
5866
5867      The flow at the entry to this function:
5868
5869         loop:
5870           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5871           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5872           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5873         loop_exit:
5874           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5875           use <s_out0>
5876           use <s_out0>
5877
5878      The above is transformed by this function into:
5879
5880         loop:
5881           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5882           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5883           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5884         loop_exit:
5885           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5886           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5887           v_out2 = reduce <v_out1>
5888           s_out3 = extract_field <v_out2, 0>
5889           s_out4 = adjust_result <s_out3>
5890           use <s_out4>
5891           use <s_out4>
5892 */
5893
5894 static void
5895 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5896                                   stmt_vec_info stmt_info,
5897                                   slp_tree slp_node,
5898                                   slp_instance slp_node_instance)
5899 {
5900   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5901   gcc_assert (reduc_info->is_reduc_info);
5902   /* For double reductions we need to get at the inner loop reduction
5903      stmt which has the meta info attached.  Our stmt_info is that of the
5904      loop-closed PHI of the inner loop which we remember as
5905      def for the reduction PHI generation.  */
5906   bool double_reduc = false;
5907   stmt_vec_info rdef_info = stmt_info;
5908   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5909     {
5910       gcc_assert (!slp_node);
5911       double_reduc = true;
5912       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5913                                             (stmt_info->stmt, 0));
5914       stmt_info = vect_stmt_to_vectorize (stmt_info);
5915     }
5916   gphi *reduc_def_stmt
5917     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5918   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5919   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5920   tree vectype;
5921   machine_mode mode;
5922   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5923   basic_block exit_bb;
5924   tree scalar_dest;
5925   tree scalar_type;
5926   gimple *new_phi = NULL, *phi = NULL;
5927   gimple_stmt_iterator exit_gsi;
5928   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5929   gimple *epilog_stmt = NULL;
5930   gimple *exit_phi;
5931   tree bitsize;
5932   tree def;
5933   tree orig_name, scalar_result;
5934   imm_use_iterator imm_iter, phi_imm_iter;
5935   use_operand_p use_p, phi_use_p;
5936   gimple *use_stmt;
5937   auto_vec<tree> reduc_inputs;
5938   int j, i;
5939   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5940   unsigned int group_size = 1, k;
5941   auto_vec<gimple *> phis;
5942   /* SLP reduction without reduction chain, e.g.,
5943      # a1 = phi <a2, a0>
5944      # b1 = phi <b2, b0>
5945      a2 = operation (a1)
5946      b2 = operation (b1)  */
5947   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5948   bool direct_slp_reduc;
5949   tree induction_index = NULL_TREE;
5950
5951   if (slp_node)
5952     group_size = SLP_TREE_LANES (slp_node);
5953
5954   if (nested_in_vect_loop_p (loop, stmt_info))
5955     {
5956       outer_loop = loop;
5957       loop = loop->inner;
5958       gcc_assert (!slp_node && double_reduc);
5959     }
5960
5961   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5962   gcc_assert (vectype);
5963   mode = TYPE_MODE (vectype);
5964
5965   tree induc_val = NULL_TREE;
5966   tree adjustment_def = NULL;
5967   if (slp_node)
5968     ;
5969   else
5970     {
5971       /* Optimize: for induction condition reduction, if we can't use zero
5972          for induc_val, use initial_def.  */
5973       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5974         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5975       else if (double_reduc)
5976         ;
5977       else
5978         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5979     }
5980
5981   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5982   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5983   if (slp_reduc)
5984     /* All statements produce live-out values.  */
5985     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5986   else if (slp_node)
5987     {
5988       /* The last statement in the reduction chain produces the live-out
5989          value.  Note SLP optimization can shuffle scalar stmts to
5990          optimize permutations so we have to search for the last stmt.  */
5991       for (k = 0; k < group_size; ++k)
5992         if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5993           {
5994             single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5995             break;
5996           }
5997     }
5998
5999   unsigned vec_num;
6000   int ncopies;
6001   if (slp_node)
6002     {
6003       vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6004       ncopies = 1;
6005     }
6006   else
6007     {
6008       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6009       vec_num = 1;
6010       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6011     }
6012
6013   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6014      which is updated with the current index of the loop for every match of
6015      the original loop's cond_expr (VEC_STMT).  This results in a vector
6016      containing the last time the condition passed for that vector lane.
6017      The first match will be a 1 to allow 0 to be used for non-matching
6018      indexes.  If there are no matches at all then the vector will be all
6019      zeroes.
6020
6021      PR92772: This algorithm is broken for architectures that support
6022      masked vectors, but do not provide fold_extract_last.  */
6023   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6024     {
6025       auto_vec<std::pair<tree, bool>, 2> ccompares;
6026       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6027       cond_info = vect_stmt_to_vectorize (cond_info);
6028       while (cond_info != reduc_info)
6029         {
6030           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6031             {
6032               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6033               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6034               ccompares.safe_push
6035                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6036                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
6037             }
6038           cond_info
6039             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6040                                                  1 + STMT_VINFO_REDUC_IDX
6041                                                         (cond_info)));
6042           cond_info = vect_stmt_to_vectorize (cond_info);
6043         }
6044       gcc_assert (ccompares.length () != 0);
6045
6046       tree indx_before_incr, indx_after_incr;
6047       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6048       int scalar_precision
6049         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6050       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6051       tree cr_index_vector_type = get_related_vectype_for_scalar_type
6052         (TYPE_MODE (vectype), cr_index_scalar_type,
6053          TYPE_VECTOR_SUBPARTS (vectype));
6054
6055       /* First we create a simple vector induction variable which starts
6056          with the values {1,2,3,...} (SERIES_VECT) and increments by the
6057          vector size (STEP).  */
6058
6059       /* Create a {1,2,3,...} vector.  */
6060       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6061
6062       /* Create a vector of the step value.  */
6063       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6064       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6065
6066       /* Create an induction variable.  */
6067       gimple_stmt_iterator incr_gsi;
6068       bool insert_after;
6069       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6070       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6071                  insert_after, &indx_before_incr, &indx_after_incr);
6072
6073       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6074          filled with zeros (VEC_ZERO).  */
6075
6076       /* Create a vector of 0s.  */
6077       tree zero = build_zero_cst (cr_index_scalar_type);
6078       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6079
6080       /* Create a vector phi node.  */
6081       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6082       new_phi = create_phi_node (new_phi_tree, loop->header);
6083       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6084                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
6085
6086       /* Now take the condition from the loop's original cond_exprs
6087          and produce a new cond_exprs (INDEX_COND_EXPR) which for
6088          every match uses values from the induction variable
6089          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6090          (NEW_PHI_TREE).
6091          Finally, we update the phi (NEW_PHI_TREE) to take the value of
6092          the new cond_expr (INDEX_COND_EXPR).  */
6093       gimple_seq stmts = NULL;
6094       for (int i = ccompares.length () - 1; i != -1; --i)
6095         {
6096           tree ccompare = ccompares[i].first;
6097           if (ccompares[i].second)
6098             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6099                                          cr_index_vector_type,
6100                                          ccompare,
6101                                          indx_before_incr, new_phi_tree);
6102           else
6103             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6104                                          cr_index_vector_type,
6105                                          ccompare,
6106                                          new_phi_tree, indx_before_incr);
6107         }
6108       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6109
6110       /* Update the phi with the vec cond.  */
6111       induction_index = new_phi_tree;
6112       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6113                    loop_latch_edge (loop), UNKNOWN_LOCATION);
6114     }
6115
6116   /* 2. Create epilog code.
6117         The reduction epilog code operates across the elements of the vector
6118         of partial results computed by the vectorized loop.
6119         The reduction epilog code consists of:
6120
6121         step 1: compute the scalar result in a vector (v_out2)
6122         step 2: extract the scalar result (s_out3) from the vector (v_out2)
6123         step 3: adjust the scalar result (s_out3) if needed.
6124
6125         Step 1 can be accomplished using one of the following three schemes:
6126           (scheme 1) using reduc_fn, if available.
6127           (scheme 2) using whole-vector shifts, if available.
6128           (scheme 3) using a scalar loop. In this case steps 1+2 above are
6129                      combined.
6130
6131           The overall epilog code looks like this:
6132
6133           s_out0 = phi <s_loop>         # original EXIT_PHI
6134           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
6135           v_out2 = reduce <v_out1>              # step 1
6136           s_out3 = extract_field <v_out2, 0>    # step 2
6137           s_out4 = adjust_result <s_out3>       # step 3
6138
6139           (step 3 is optional, and steps 1 and 2 may be combined).
6140           Lastly, the uses of s_out0 are replaced by s_out4.  */
6141
6142
6143   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6144          v_out1 = phi <VECT_DEF>
6145          Store the (VECTYPE-converted) PHI results in REDUC_INPUTS.  */
6146   if (double_reduc)
6147     loop = outer_loop;
6148   exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
6149   exit_gsi = gsi_after_labels (exit_bb);
6150   reduc_inputs.create (slp_node ? vec_num : ncopies);
6151   for (unsigned i = 0; i < vec_num; i++)
6152     {
6153       gimple_seq stmts = NULL;
6154       if (slp_node)
6155         def = vect_get_slp_vect_def (slp_node, i);
6156       else
6157         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6158       for (j = 0; j < ncopies; j++)
6159         {
6160           tree new_def = copy_ssa_name (def);
6161           phi = create_phi_node (new_def, exit_bb);
6162           if (j)
6163             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6164           SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
6165           new_def = gimple_convert (&stmts, vectype, new_def);
6166           reduc_inputs.quick_push (new_def);
6167         }
6168       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6169     }
6170
6171   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6172          (i.e. when reduc_fn is not available) and in the final adjustment
6173          code (if needed).  Also get the original scalar reduction variable as
6174          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
6175          represents a reduction pattern), the tree-code and scalar-def are
6176          taken from the original stmt that the pattern-stmt (STMT) replaces.
6177          Otherwise (it is a regular reduction) - the tree-code and scalar-def
6178          are taken from STMT.  */
6179
6180   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6181   if (orig_stmt_info != stmt_info)
6182     {
6183       /* Reduction pattern  */
6184       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6185       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6186     }
6187
6188   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6189   scalar_type = TREE_TYPE (scalar_dest);
6190   scalar_results.truncate (0);
6191   scalar_results.reserve_exact (group_size);
6192   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6193   bitsize = TYPE_SIZE (scalar_type);
6194
6195   /* True if we should implement SLP_REDUC using native reduction operations
6196      instead of scalar operations.  */
6197   direct_slp_reduc = (reduc_fn != IFN_LAST
6198                       && slp_reduc
6199                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6200
6201   /* In case of reduction chain, e.g.,
6202      # a1 = phi <a3, a0>
6203      a2 = operation (a1)
6204      a3 = operation (a2),
6205
6206      we may end up with more than one vector result.  Here we reduce them
6207      to one vector.
6208
6209      The same is true for a SLP reduction, e.g.,
6210      # a1 = phi <a2, a0>
6211      # b1 = phi <b2, b0>
6212      a2 = operation (a1)
6213      b2 = operation (b1),
6214
6215      where we can end up with more than one vector as well.  We can
6216      easily accumulate vectors when the number of vector elements is
6217      a multiple of the SLP group size.
6218
6219      The same is true if we couldn't use a single defuse cycle.  */
6220   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6221       || direct_slp_reduc
6222       || (slp_reduc
6223           && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6224       || ncopies > 1)
6225     {
6226       gimple_seq stmts = NULL;
6227       tree single_input = reduc_inputs[0];
6228       for (k = 1; k < reduc_inputs.length (); k++)
6229         single_input = gimple_build (&stmts, code, vectype,
6230                                      single_input, reduc_inputs[k]);
6231       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6232
6233       reduc_inputs.truncate (0);
6234       reduc_inputs.safe_push (single_input);
6235     }
6236
6237   tree orig_reduc_input = reduc_inputs[0];
6238
6239   /* If this loop is an epilogue loop that can be skipped after the
6240      main loop, we can only share a reduction operation between the
6241      main loop and the epilogue if we put it at the target of the
6242      skip edge.
6243
6244      We can still reuse accumulators if this check fails.  Doing so has
6245      the minor(?) benefit of making the epilogue loop's scalar result
6246      independent of the main loop's scalar result.  */
6247   bool unify_with_main_loop_p = false;
6248   if (reduc_info->reused_accumulator
6249       && loop_vinfo->skip_this_loop_edge
6250       && single_succ_p (exit_bb)
6251       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6252     {
6253       unify_with_main_loop_p = true;
6254
6255       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6256       reduc_inputs[0] = make_ssa_name (vectype);
6257       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6258       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6259                    UNKNOWN_LOCATION);
6260       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6261                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6262       exit_gsi = gsi_after_labels (reduc_block);
6263     }
6264
6265   /* Shouldn't be used beyond this point.  */
6266   exit_bb = nullptr;
6267
6268   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6269       && reduc_fn != IFN_LAST)
6270     {
6271       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6272          various data values where the condition matched and another vector
6273          (INDUCTION_INDEX) containing all the indexes of those matches.  We
6274          need to extract the last matching index (which will be the index with
6275          highest value) and use this to index into the data vector.
6276          For the case where there were no matches, the data vector will contain
6277          all default values and the index vector will be all zeros.  */
6278
6279       /* Get various versions of the type of the vector of indexes.  */
6280       tree index_vec_type = TREE_TYPE (induction_index);
6281       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6282       tree index_scalar_type = TREE_TYPE (index_vec_type);
6283       tree index_vec_cmp_type = truth_type_for (index_vec_type);
6284
6285       /* Get an unsigned integer version of the type of the data vector.  */
6286       int scalar_precision
6287         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6288       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6289       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6290                                                 vectype);
6291
6292       /* First we need to create a vector (ZERO_VEC) of zeros and another
6293          vector (MAX_INDEX_VEC) filled with the last matching index, which we
6294          can create using a MAX reduction and then expanding.
6295          In the case where the loop never made any matches, the max index will
6296          be zero.  */
6297
6298       /* Vector of {0, 0, 0,...}.  */
6299       tree zero_vec = build_zero_cst (vectype);
6300
6301       /* Find maximum value from the vector of found indexes.  */
6302       tree max_index = make_ssa_name (index_scalar_type);
6303       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6304                                                           1, induction_index);
6305       gimple_call_set_lhs (max_index_stmt, max_index);
6306       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6307
6308       /* Vector of {max_index, max_index, max_index,...}.  */
6309       tree max_index_vec = make_ssa_name (index_vec_type);
6310       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6311                                                       max_index);
6312       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6313                                                         max_index_vec_rhs);
6314       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6315
6316       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6317          with the vector (INDUCTION_INDEX) of found indexes, choosing values
6318          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6319          otherwise.  Only one value should match, resulting in a vector
6320          (VEC_COND) with one data value and the rest zeros.
6321          In the case where the loop never made any matches, every index will
6322          match, resulting in a vector with all data values (which will all be
6323          the default value).  */
6324
6325       /* Compare the max index vector to the vector of found indexes to find
6326          the position of the max value.  */
6327       tree vec_compare = make_ssa_name (index_vec_cmp_type);
6328       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6329                                                       induction_index,
6330                                                       max_index_vec);
6331       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6332
6333       /* Use the compare to choose either values from the data vector or
6334          zero.  */
6335       tree vec_cond = make_ssa_name (vectype);
6336       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6337                                                    vec_compare,
6338                                                    reduc_inputs[0],
6339                                                    zero_vec);
6340       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6341
6342       /* Finally we need to extract the data value from the vector (VEC_COND)
6343          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
6344          reduction, but because this doesn't exist, we can use a MAX reduction
6345          instead.  The data value might be signed or a float so we need to cast
6346          it first.
6347          In the case where the loop never made any matches, the data values are
6348          all identical, and so will reduce down correctly.  */
6349
6350       /* Make the matched data values unsigned.  */
6351       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6352       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6353                                        vec_cond);
6354       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6355                                                         VIEW_CONVERT_EXPR,
6356                                                         vec_cond_cast_rhs);
6357       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6358
6359       /* Reduce down to a scalar value.  */
6360       tree data_reduc = make_ssa_name (scalar_type_unsigned);
6361       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6362                                                            1, vec_cond_cast);
6363       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6364       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6365
6366       /* Convert the reduced value back to the result type and set as the
6367          result.  */
6368       gimple_seq stmts = NULL;
6369       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6370                                data_reduc);
6371       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6372       scalar_results.safe_push (new_temp);
6373     }
6374   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6375            && reduc_fn == IFN_LAST)
6376     {
6377       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
6378          idx = 0;
6379          idx_val = induction_index[0];
6380          val = data_reduc[0];
6381          for (idx = 0, val = init, i = 0; i < nelts; ++i)
6382            if (induction_index[i] > idx_val)
6383              val = data_reduc[i], idx_val = induction_index[i];
6384          return val;  */
6385
6386       tree data_eltype = TREE_TYPE (vectype);
6387       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6388       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6389       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6390       /* Enforced by vectorizable_reduction, which ensures we have target
6391          support before allowing a conditional reduction on variable-length
6392          vectors.  */
6393       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6394       tree idx_val = NULL_TREE, val = NULL_TREE;
6395       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6396         {
6397           tree old_idx_val = idx_val;
6398           tree old_val = val;
6399           idx_val = make_ssa_name (idx_eltype);
6400           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6401                                              build3 (BIT_FIELD_REF, idx_eltype,
6402                                                      induction_index,
6403                                                      bitsize_int (el_size),
6404                                                      bitsize_int (off)));
6405           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6406           val = make_ssa_name (data_eltype);
6407           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6408                                              build3 (BIT_FIELD_REF,
6409                                                      data_eltype,
6410                                                      reduc_inputs[0],
6411                                                      bitsize_int (el_size),
6412                                                      bitsize_int (off)));
6413           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6414           if (off != 0)
6415             {
6416               tree new_idx_val = idx_val;
6417               if (off != v_size - el_size)
6418                 {
6419                   new_idx_val = make_ssa_name (idx_eltype);
6420                   epilog_stmt = gimple_build_assign (new_idx_val,
6421                                                      MAX_EXPR, idx_val,
6422                                                      old_idx_val);
6423                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6424                 }
6425               tree cond = make_ssa_name (boolean_type_node);
6426               epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6427                                                  idx_val, old_idx_val);
6428               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6429               tree new_val = make_ssa_name (data_eltype);
6430               epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6431                                                  cond, val, old_val);
6432               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6433               idx_val = new_idx_val;
6434               val = new_val;
6435             }
6436         }
6437       /* Convert the reduced value back to the result type and set as the
6438          result.  */
6439       gimple_seq stmts = NULL;
6440       val = gimple_convert (&stmts, scalar_type, val);
6441       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6442       scalar_results.safe_push (val);
6443     }
6444
6445   /* 2.3 Create the reduction code, using one of the three schemes described
6446          above. In SLP we simply need to extract all the elements from the
6447          vector (without reducing them), so we use scalar shifts.  */
6448   else if (reduc_fn != IFN_LAST && !slp_reduc)
6449     {
6450       tree tmp;
6451       tree vec_elem_type;
6452
6453       /* Case 1:  Create:
6454          v_out2 = reduc_expr <v_out1>  */
6455
6456       if (dump_enabled_p ())
6457         dump_printf_loc (MSG_NOTE, vect_location,
6458                          "Reduce using direct vector reduction.\n");
6459
6460       gimple_seq stmts = NULL;
6461       vec_elem_type = TREE_TYPE (vectype);
6462       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6463                                vec_elem_type, reduc_inputs[0]);
6464       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6465       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6466
6467       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6468           && induc_val)
6469         {
6470           /* Earlier we set the initial value to be a vector of induc_val
6471              values.  Check the result and if it is induc_val then replace
6472              with the original initial value, unless induc_val is
6473              the same as initial_def already.  */
6474           tree zcompare = make_ssa_name (boolean_type_node);
6475           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6476                                              new_temp, induc_val);
6477           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6478           tree initial_def = reduc_info->reduc_initial_values[0];
6479           tmp = make_ssa_name (new_scalar_dest);
6480           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6481                                              initial_def, new_temp);
6482           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6483           new_temp = tmp;
6484         }
6485
6486       scalar_results.safe_push (new_temp);
6487     }
6488   else if (direct_slp_reduc)
6489     {
6490       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6491          with the elements for other SLP statements replaced with the
6492          neutral value.  We can then do a normal reduction on each vector.  */
6493
6494       /* Enforced by vectorizable_reduction.  */
6495       gcc_assert (reduc_inputs.length () == 1);
6496       gcc_assert (pow2p_hwi (group_size));
6497
6498       gimple_seq seq = NULL;
6499
6500       /* Build a vector {0, 1, 2, ...}, with the same number of elements
6501          and the same element size as VECTYPE.  */
6502       tree index = build_index_vector (vectype, 0, 1);
6503       tree index_type = TREE_TYPE (index);
6504       tree index_elt_type = TREE_TYPE (index_type);
6505       tree mask_type = truth_type_for (index_type);
6506
6507       /* Create a vector that, for each element, identifies which of
6508          the REDUC_GROUP_SIZE results should use it.  */
6509       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6510       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6511                             build_vector_from_val (index_type, index_mask));
6512
6513       /* Get a neutral vector value.  This is simply a splat of the neutral
6514          scalar value if we have one, otherwise the initial scalar value
6515          is itself a neutral value.  */
6516       tree vector_identity = NULL_TREE;
6517       tree neutral_op = NULL_TREE;
6518       if (slp_node)
6519         {
6520           tree initial_value = NULL_TREE;
6521           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6522             initial_value = reduc_info->reduc_initial_values[0];
6523           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6524                                                  initial_value, false);
6525         }
6526       if (neutral_op)
6527         vector_identity = gimple_build_vector_from_val (&seq, vectype,
6528                                                         neutral_op);
6529       for (unsigned int i = 0; i < group_size; ++i)
6530         {
6531           /* If there's no universal neutral value, we can use the
6532              initial scalar value from the original PHI.  This is used
6533              for MIN and MAX reduction, for example.  */
6534           if (!neutral_op)
6535             {
6536               tree scalar_value = reduc_info->reduc_initial_values[i];
6537               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6538                                              scalar_value);
6539               vector_identity = gimple_build_vector_from_val (&seq, vectype,
6540                                                               scalar_value);
6541             }
6542
6543           /* Calculate the equivalent of:
6544
6545              sel[j] = (index[j] == i);
6546
6547              which selects the elements of REDUC_INPUTS[0] that should
6548              be included in the result.  */
6549           tree compare_val = build_int_cst (index_elt_type, i);
6550           compare_val = build_vector_from_val (index_type, compare_val);
6551           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6552                                    index, compare_val);
6553
6554           /* Calculate the equivalent of:
6555
6556              vec = sel ? reduc_inputs[0] : vector_identity;
6557
6558              VEC is now suitable for a full vector reduction.  */
6559           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6560                                    sel, reduc_inputs[0], vector_identity);
6561
6562           /* Do the reduction and convert it to the appropriate type.  */
6563           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6564                                       TREE_TYPE (vectype), vec);
6565           scalar = gimple_convert (&seq, scalar_type, scalar);
6566           scalar_results.safe_push (scalar);
6567         }
6568       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6569     }
6570   else
6571     {
6572       bool reduce_with_shift;
6573       tree vec_temp;
6574
6575       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6576
6577       /* See if the target wants to do the final (shift) reduction
6578          in a vector mode of smaller size and first reduce upper/lower
6579          halves against each other.  */
6580       enum machine_mode mode1 = mode;
6581       tree stype = TREE_TYPE (vectype);
6582       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6583       unsigned nunits1 = nunits;
6584       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6585           && reduc_inputs.length () == 1)
6586         {
6587           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6588           /* For SLP reductions we have to make sure lanes match up, but
6589              since we're doing individual element final reduction reducing
6590              vector width here is even more important.
6591              ???  We can also separate lanes with permutes, for the common
6592              case of power-of-two group-size odd/even extracts would work.  */
6593           if (slp_reduc && nunits != nunits1)
6594             {
6595               nunits1 = least_common_multiple (nunits1, group_size);
6596               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6597             }
6598         }
6599       if (!slp_reduc
6600           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6601         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6602
6603       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6604                                                            stype, nunits1);
6605       reduce_with_shift = have_whole_vector_shift (mode1);
6606       if (!VECTOR_MODE_P (mode1)
6607           || !directly_supported_p (code, vectype1))
6608         reduce_with_shift = false;
6609
6610       /* First reduce the vector to the desired vector size we should
6611          do shift reduction on by combining upper and lower halves.  */
6612       gimple_seq stmts = NULL;
6613       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6614                                              code, &stmts);
6615       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6616       reduc_inputs[0] = new_temp;
6617
6618       if (reduce_with_shift && !slp_reduc)
6619         {
6620           int element_bitsize = tree_to_uhwi (bitsize);
6621           /* Enforced by vectorizable_reduction, which disallows SLP reductions
6622              for variable-length vectors and also requires direct target support
6623              for loop reductions.  */
6624           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6625           int nelements = vec_size_in_bits / element_bitsize;
6626           vec_perm_builder sel;
6627           vec_perm_indices indices;
6628
6629           int elt_offset;
6630
6631           tree zero_vec = build_zero_cst (vectype1);
6632           /* Case 2: Create:
6633              for (offset = nelements/2; offset >= 1; offset/=2)
6634                 {
6635                   Create:  va' = vec_shift <va, offset>
6636                   Create:  va = vop <va, va'>
6637                 }  */
6638
6639           tree rhs;
6640
6641           if (dump_enabled_p ())
6642             dump_printf_loc (MSG_NOTE, vect_location,
6643                              "Reduce using vector shifts\n");
6644
6645           gimple_seq stmts = NULL;
6646           new_temp = gimple_convert (&stmts, vectype1, new_temp);
6647           for (elt_offset = nelements / 2;
6648                elt_offset >= 1;
6649                elt_offset /= 2)
6650             {
6651               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6652               indices.new_vector (sel, 2, nelements);
6653               tree mask = vect_gen_perm_mask_any (vectype1, indices);
6654               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6655                                        new_temp, zero_vec, mask);
6656               new_temp = gimple_build (&stmts, code,
6657                                        vectype1, new_name, new_temp);
6658             }
6659           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6660
6661           /* 2.4  Extract the final scalar result.  Create:
6662              s_out3 = extract_field <v_out2, bitpos>  */
6663
6664           if (dump_enabled_p ())
6665             dump_printf_loc (MSG_NOTE, vect_location,
6666                              "extract scalar result\n");
6667
6668           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6669                         bitsize, bitsize_zero_node);
6670           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6671           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6672           gimple_assign_set_lhs (epilog_stmt, new_temp);
6673           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6674           scalar_results.safe_push (new_temp);
6675         }
6676       else
6677         {
6678           /* Case 3: Create:
6679              s = extract_field <v_out2, 0>
6680              for (offset = element_size;
6681                   offset < vector_size;
6682                   offset += element_size;)
6683                {
6684                  Create:  s' = extract_field <v_out2, offset>
6685                  Create:  s = op <s, s'>  // For non SLP cases
6686                }  */
6687
6688           if (dump_enabled_p ())
6689             dump_printf_loc (MSG_NOTE, vect_location,
6690                              "Reduce using scalar code.\n");
6691
6692           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6693           int element_bitsize = tree_to_uhwi (bitsize);
6694           tree compute_type = TREE_TYPE (vectype);
6695           gimple_seq stmts = NULL;
6696           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6697             {
6698               int bit_offset;
6699               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6700                                        vec_temp, bitsize, bitsize_zero_node);
6701
6702               /* In SLP we don't need to apply reduction operation, so we just
6703                  collect s' values in SCALAR_RESULTS.  */
6704               if (slp_reduc)
6705                 scalar_results.safe_push (new_temp);
6706
6707               for (bit_offset = element_bitsize;
6708                    bit_offset < vec_size_in_bits;
6709                    bit_offset += element_bitsize)
6710                 {
6711                   tree bitpos = bitsize_int (bit_offset);
6712                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
6713                                            compute_type, vec_temp,
6714                                            bitsize, bitpos);
6715                   if (slp_reduc)
6716                     {
6717                       /* In SLP we don't need to apply reduction operation, so
6718                          we just collect s' values in SCALAR_RESULTS.  */
6719                       new_temp = new_name;
6720                       scalar_results.safe_push (new_name);
6721                     }
6722                   else
6723                     new_temp = gimple_build (&stmts, code, compute_type,
6724                                              new_name, new_temp);
6725                 }
6726             }
6727
6728           /* The only case where we need to reduce scalar results in SLP, is
6729              unrolling.  If the size of SCALAR_RESULTS is greater than
6730              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6731              REDUC_GROUP_SIZE.  */
6732           if (slp_reduc)
6733             {
6734               tree res, first_res, new_res;
6735
6736               /* Reduce multiple scalar results in case of SLP unrolling.  */
6737               for (j = group_size; scalar_results.iterate (j, &res);
6738                    j++)
6739                 {
6740                   first_res = scalar_results[j % group_size];
6741                   new_res = gimple_build (&stmts, code, compute_type,
6742                                           first_res, res);
6743                   scalar_results[j % group_size] = new_res;
6744                 }
6745               scalar_results.truncate (group_size);
6746               for (k = 0; k < group_size; k++)
6747                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6748                                                     scalar_results[k]);
6749             }
6750           else
6751             {
6752               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6753               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6754               scalar_results.safe_push (new_temp);
6755             }
6756
6757           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6758         }
6759
6760       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6761           && induc_val)
6762         {
6763           /* Earlier we set the initial value to be a vector of induc_val
6764              values.  Check the result and if it is induc_val then replace
6765              with the original initial value, unless induc_val is
6766              the same as initial_def already.  */
6767           tree zcompare = make_ssa_name (boolean_type_node);
6768           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6769                                              induc_val);
6770           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6771           tree initial_def = reduc_info->reduc_initial_values[0];
6772           tree tmp = make_ssa_name (new_scalar_dest);
6773           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6774                                              initial_def, new_temp);
6775           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6776           scalar_results[0] = tmp;
6777         }
6778     }
6779
6780   /* 2.5 Adjust the final result by the initial value of the reduction
6781          variable. (When such adjustment is not needed, then
6782          'adjustment_def' is zero).  For example, if code is PLUS we create:
6783          new_temp = loop_exit_def + adjustment_def  */
6784
6785   if (adjustment_def)
6786     {
6787       gcc_assert (!slp_reduc);
6788       gimple_seq stmts = NULL;
6789       if (double_reduc)
6790         {
6791           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6792           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6793           new_temp = gimple_build (&stmts, code, vectype,
6794                                    reduc_inputs[0], adjustment_def);
6795         }
6796       else
6797         {
6798           new_temp = scalar_results[0];
6799           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6800           adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6801                                            adjustment_def);
6802           new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6803           new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6804                                    new_temp, adjustment_def);
6805           new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6806         }
6807
6808       epilog_stmt = gimple_seq_last_stmt (stmts);
6809       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6810       scalar_results[0] = new_temp;
6811     }
6812
6813   /* Record this operation if it could be reused by the epilogue loop.  */
6814   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6815       && reduc_inputs.length () == 1)
6816     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6817                                            { orig_reduc_input, reduc_info });
6818
6819   if (double_reduc)
6820     loop = outer_loop;
6821
6822   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6823           phis with new adjusted scalar results, i.e., replace use <s_out0>
6824           with use <s_out4>.
6825
6826      Transform:
6827         loop_exit:
6828           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6829           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6830           v_out2 = reduce <v_out1>
6831           s_out3 = extract_field <v_out2, 0>
6832           s_out4 = adjust_result <s_out3>
6833           use <s_out0>
6834           use <s_out0>
6835
6836      into:
6837
6838         loop_exit:
6839           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6840           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6841           v_out2 = reduce <v_out1>
6842           s_out3 = extract_field <v_out2, 0>
6843           s_out4 = adjust_result <s_out3>
6844           use <s_out4>
6845           use <s_out4> */
6846
6847   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6848   for (k = 0; k < live_out_stmts.size (); k++)
6849     {
6850       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6851       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6852
6853       phis.create (3);
6854       /* Find the loop-closed-use at the loop exit of the original scalar
6855          result.  (The reduction result is expected to have two immediate uses,
6856          one at the latch block, and one at the loop exit).  For double
6857          reductions we are looking for exit phis of the outer loop.  */
6858       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6859         {
6860           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6861             {
6862               if (!is_gimple_debug (USE_STMT (use_p)))
6863                 phis.safe_push (USE_STMT (use_p));
6864             }
6865           else
6866             {
6867               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6868                 {
6869                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6870
6871                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6872                     {
6873                       if (!flow_bb_inside_loop_p (loop,
6874                                              gimple_bb (USE_STMT (phi_use_p)))
6875                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6876                         phis.safe_push (USE_STMT (phi_use_p));
6877                     }
6878                 }
6879             }
6880         }
6881
6882       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6883         {
6884           /* Replace the uses:  */
6885           orig_name = PHI_RESULT (exit_phi);
6886
6887           /* Look for a single use at the target of the skip edge.  */
6888           if (unify_with_main_loop_p)
6889             {
6890               use_operand_p use_p;
6891               gimple *user;
6892               if (!single_imm_use (orig_name, &use_p, &user))
6893                 gcc_unreachable ();
6894               orig_name = gimple_get_lhs (user);
6895             }
6896
6897           scalar_result = scalar_results[k];
6898           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6899             {
6900               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6901                 SET_USE (use_p, scalar_result);
6902               update_stmt (use_stmt);
6903             }
6904         }
6905
6906       phis.release ();
6907     }
6908 }
6909
6910 /* Build the vector select "MASK ? VEC : IDENTITY" as a VEC_COND_EXPR
6911    whose result is a fresh temporary SSA name of type VECTYPE.  The new
6912    statement is inserted before GSI and the SSA name is returned.  */
6913
6914 static tree
6915 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6916                      tree vec, tree identity)
6917 {
6918   tree merged = make_temp_ssa_name (vectype, NULL, "cond");
6919   gassign *select
6920     = gimple_build_assign (merged, VEC_COND_EXPR, mask, vec, identity);
6921   gsi_insert_before (gsi, select, GSI_SAME_STMT);
6922   return merged;
6923 }
6924
6925 /* Open-code an in-order reduction: fold each element of VECTOR_RHS into
6926    LHS with CODE, walking the vector from bit offset zero upwards.  The
6927    new statements go before GSI and define SSA names based on SCALAR_DEST.
6928    A non-null MASK first replaces the unselected lanes of VECTOR_RHS with
6929    CODE's neutral value.  Return the SSA name of the final result.  */
6930
6931 static tree
6932 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6933                        tree_code code, tree lhs, tree vector_rhs,
6934                        tree mask)
6935 {
6936   tree vectype = TREE_TYPE (vector_rhs);
6937   tree elt_type = TREE_TYPE (vectype);
6938   tree elt_size = TYPE_SIZE (elt_type);
6939   unsigned HOST_WIDE_INT total_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6940   unsigned HOST_WIDE_INT elt_bits = tree_to_uhwi (elt_size);
6941
6942   /* With a mask, select between the input and the neutral value up front
6943      so that the element-wise loop below can stay unconditional.  */
6944   if (mask)
6945     {
6946       tree selected = make_temp_ssa_name (vectype, NULL,
6947                                           "masked_vector_rhs");
6948       tree neutral = neutral_op_for_reduction (elt_type, code, NULL_TREE,
6949                                                false);
6950       tree identity_vec = build_vector_from_val (vectype, neutral);
6951       gassign *select = gimple_build_assign (selected, VEC_COND_EXPR,
6952                                              mask, vector_rhs, identity_vec);
6953       gsi_insert_before (gsi, select, GSI_SAME_STMT);
6954       vector_rhs = selected;
6955     }
6956
6957   tree accum = lhs;
6958   unsigned HOST_WIDE_INT offset = 0;
6959   while (offset < total_bits)
6960     {
6961       /* ELT = the element of VECTOR_RHS at bit position OFFSET.  */
6962       tree ref = build3 (BIT_FIELD_REF, elt_type, vector_rhs,
6963                          elt_size, bitsize_int (offset));
6964       gassign *extract = gimple_build_assign (scalar_dest, ref);
6965       tree elt = make_ssa_name (scalar_dest, extract);
6966       gimple_assign_set_lhs (extract, elt);
6967       gsi_insert_before (gsi, extract, GSI_SAME_STMT);
6968
6969       /* ACCUM = ACCUM CODE ELT.  */
6970       gassign *apply = gimple_build_assign (scalar_dest, code, accum, elt);
6971       accum = make_ssa_name (scalar_dest, apply);
6972       gimple_assign_set_lhs (apply, accum);
6973       gsi_insert_before (gsi, apply, GSI_SAME_STMT);
6974       offset += elt_bits;
6975     }
6976   return accum;
6977 }
6978
6979 /* Return the masked counterpart of REDUC_FN that the target supports
6980    directly for vector input type VECTYPE_IN when optimizing for speed.
6981    The mask-only variant is tried before the mask-and-length variant.
6982    Return IFN_LAST if REDUC_FN has no masked variants here or if the
6983    target supports neither of them.  */
6984
6985 static internal_fn
6986 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6987 {
6988   /* Only IFN_FOLD_LEFT_PLUS is mapped to masked variants.  */
6989   if (reduc_fn != IFN_FOLD_LEFT_PLUS)
6990     return IFN_LAST;
6991
6992   /* The variants to try, most preferred first.  */
6993   const internal_fn candidates[] =
6994     {
6995       IFN_MASK_FOLD_LEFT_PLUS,
6996       IFN_MASK_LEN_FOLD_LEFT_PLUS
6997     };
6998
6999   for (internal_fn candidate : candidates)
7000     {
7001       if (direct_internal_fn_supported_p (candidate, vectype_in,
7002                                           OPTIMIZE_FOR_SPEED))
7003         return candidate;
7004     }
7005   return IFN_LAST;
7006 }
7007
7008 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
7009    statement that sets the live-out value and REDUC_DEF_STMT is the phi.
7010    CODE is the operation of STMT_INFO, OPS its NUM_OPS (2 or 4) scalar
7011    operands and REDUC_INDEX the index in OPS of the operand set by the phi.
7012    REDUC_FN implements in-order reduction, or is IFN_LAST to open-code it.
7013    VECTYPE_IN is the type of the vector input.  MASKS and LENS are used in
7014    fully-masked loops and in loops fully controlled by lengths.  Insert at
7015    GSI; without SLP_NODE set *VEC_STMT.  Return false if not supported.  */
7016
7017 static bool
7018 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7019                                stmt_vec_info stmt_info,
7020                                gimple_stmt_iterator *gsi,
7021                                gimple **vec_stmt, slp_tree slp_node,
7022                                gimple *reduc_def_stmt,
7023                                code_helper code, internal_fn reduc_fn,
7024                                tree *ops, int num_ops, tree vectype_in,
7025                                int reduc_index, vec_loop_masks *masks,
7026                                vec_loop_lens *lens)
7027 {
7028   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7029   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7030   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7031
7032   int ncopies;
7033   if (slp_node)
7034     ncopies = 1;
7035   else
7036     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7037
7038   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7039   gcc_assert (ncopies == 1);
7040
7041   bool is_cond_op = false;  /* True if CODE is a conditional internal fn.  */
7042   if (!code.is_tree_code ())
7043     {
7044       code = conditional_internal_fn_code (internal_fn (code));
7045       gcc_assert (code != ERROR_MARK);
7046       is_cond_op = true;
7047     }
7048
7049   gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7050
7051   if (slp_node)
7052     {
7053       if (is_cond_op)
7054         {
7055           if (dump_enabled_p ())
7056             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7057                              "fold-left reduction on SLP not supported.\n");
7058           return false;
7059         }
7060
7061       gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7062                             TYPE_VECTOR_SUBPARTS (vectype_in)));
7063     }
7064
7065   /* The operands either come from a binary operation or an IFN_COND operation.
7066      The former is a gimple assign with binary rhs and the latter is a
7067      gimple call with four arguments.  */
7068   gcc_assert (num_ops == 2 || num_ops == 4);
7069   tree op0, opmask;
7070   if (!is_cond_op)
7071     op0 = ops[1 - reduc_index];
7072   else
7073     {
7074       op0 = ops[2];
7075       opmask = ops[0];
7076       gcc_assert (!slp_node);
7077     }
7078
7079   int group_size = 1;
7080   stmt_vec_info scalar_dest_def_info;
7081   auto_vec<tree> vec_oprnds0, vec_opmask;
7082   if (slp_node)
7083     {
7084       auto_vec<vec<tree> > vec_defs (2);
7085       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7086       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7087       vec_defs[0].release ();
7088       vec_defs[1].release ();
7089       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7090       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7091     }
7092   else
7093     {
7094       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7095                                      op0, &vec_oprnds0);
7096       scalar_dest_def_info = stmt_info;
7097
7098       /* For an IFN_COND_OP we also need the vector mask operand.  */
7099       if (is_cond_op)
7100           vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7101                                          opmask, &vec_opmask);
7102     }
7103
7104   gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7105   tree scalar_dest = gimple_get_lhs (sdef);
7106   tree scalar_type = TREE_TYPE (scalar_dest);
7107   tree reduc_var = gimple_phi_result (reduc_def_stmt);
7108
7109   int vec_num = vec_oprnds0.length ();
7110   gcc_assert (vec_num == 1 || slp_node);
7111   tree vec_elem_type = TREE_TYPE (vectype_out);
7112   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7113
7114   tree vector_identity = NULL_TREE;
7115   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7116     {
7117       vector_identity = build_zero_cst (vectype_out);
7118       if (!HONOR_SIGNED_ZEROS (vectype_out))
7119         ;  /* Keep the zero vector.  */
7120       else  /* Signed zeros matter: use the negated zero vector.  */
7121         {
7122           gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7123           vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7124                                         vector_identity);
7125         }
7126     }
7127
7128   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7129   int i;
7130   tree def0;
7131   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7132     {
7133       gimple *new_stmt;
7134       tree mask = NULL_TREE;
7135       tree len = NULL_TREE;
7136       tree bias = NULL_TREE;
7137       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7138         mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7139       else if (is_cond_op)
7140         mask = vec_opmask[0];
7141       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7142         {
7143           len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7144                                    i, 1);
7145           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7146           bias = build_int_cst (intQI_type_node, biasval);
7147           if (!is_cond_op)  /* Use an all-ones mask.  */
7148             mask = build_minus_one_cst (truth_type_for (vectype_in));
7149         }
7150
7151       /* Handle MINUS by adding the negative.  */
7152       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7153         {
7154           tree negated = make_ssa_name (vectype_out);
7155           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7156           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7157           def0 = negated;
7158         }
7159
7160       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7161           && mask && mask_reduc_fn == IFN_LAST)
7162         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7163                                     vector_identity);
7164
7165       /* On the first iteration the input is simply the scalar phi
7166          result, and for subsequent iterations it is the output of
7167          the preceding operation.  */
7168       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7169         {
7170           if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7171             new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7172                                                    def0, mask, len, bias);
7173           else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7174             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7175                                                    def0, mask);
7176           else
7177             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7178                                                    def0);
7179           /* For chained SLP reductions the output of the previous reduction
7180              operation serves as the input of the next. For the final statement
7181              the output cannot be a temporary - we reuse the original
7182              scalar destination of the last statement.  */
7183           if (i != vec_num - 1)
7184             {
7185               gimple_set_lhs (new_stmt, scalar_dest_var);
7186               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7187               gimple_set_lhs (new_stmt, reduc_var);
7188             }
7189         }
7190       else
7191         {
7192           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7193                                              tree_code (code), reduc_var, def0,
7194                                              mask);
7195           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7196           /* Remove the statement, so that we can use the same code paths
7197              as for statements that we've just created.  */
7198           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7199           gsi_remove (&tmp_gsi, true);
7200         }
7201
7202       if (i == vec_num - 1)
7203         {
7204           gimple_set_lhs (new_stmt, scalar_dest);
7205           vect_finish_replace_stmt (loop_vinfo,
7206                                     scalar_dest_def_info,
7207                                     new_stmt);
7208         }
7209       else
7210         vect_finish_stmt_generation (loop_vinfo,
7211                                      scalar_dest_def_info,
7212                                      new_stmt, gsi);
7213
7214       if (slp_node)
7215         slp_node->push_vec_def (new_stmt);
7216       else
7217         {
7218           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7219           *vec_stmt = new_stmt;
7220         }
7221     }
7222
7223   return true;
7224 }
7225
7226 /* Function is_nonwrapping_integer_induction.
7227
7228    Return true if the induction PHI described by STMT_VINFO has a constant
7229    integer base and step and is known not to wrap in loop LOOP: either
7230    overflow is undefined for the type of the PHI result, or BASE + STEP * N
7231    fits in that type, where N bounds the statement executions of LOOP.  */
7232
7233 static bool
7234 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7235 {
7236   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7237   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7238   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7239   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7240
7241   /* Only constant integer evolutions can be bounded here.  */
7242   if (TREE_CODE (base) != INTEGER_CST || TREE_CODE (step) != INTEGER_CST)
7243     return false;
7244
7245   /* Overflow is undefined for LHS_TYPE, so no wrapping needs checking.  */
7246   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7247     return true;
7248
7249   /* Otherwise an upper bound on the number of executions is required.  */
7250   widest_int ni;
7251   if (!max_stmt_executions (loop, &ni))
7252     return false;
7253
7254   /* Compute BASE + STEP * NI, giving up if either operation overflows.  */
7255   const signop sgn = TYPE_SIGN (lhs_type);
7256   wi::overflow_type overflow = wi::OVF_NONE;
7257   widest_int max_loop_value
7258     = wi::mul (wi::to_widest (step), ni, sgn, &overflow);
7259   if (overflow)
7260     return false;
7261   max_loop_value = wi::add (wi::to_widest (base), max_loop_value, sgn,
7262                             &overflow);
7263   if (overflow)
7264     return false;
7265   return wi::min_precision (max_loop_value, sgn) <= TYPE_PRECISION (lhs_type);
7266 }
7267
7268 /* Return true if masking of the operation CODE should be implemented by
7269    inserting a conditional expression.  COND_FN is the conditional internal
7270    function for CODE, or IFN_LAST if there is none, and VECTYPE_IN is the
7271    type of the vector input.  */
7272 static bool
7273 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7274                          tree vectype_in)
7275 {
7276   /* Nothing to insert if the target supports COND_FN directly.  */
7277   const bool cond_fn_usable
7278     = (cond_fn != IFN_LAST
7279        && direct_internal_fn_supported_p (cond_fn, vectype_in,
7280                                           OPTIMIZE_FOR_SPEED));
7281   if (cond_fn_usable)
7282     return false;
7283
7284   /* Codes that are not tree codes are never handled this way.  */
7285   if (!code.is_tree_code ())
7286     return false;
7287
7288   /* Of the tree codes, only these two are.  */
7289   const tree_code op = tree_code (code);
7290   return op == DOT_PROD_EXPR || op == SAD_EXPR;
7291 }
7292
7293 /* Enable masked vectorization of an operation with code CODE by inserting
7294    a conditional expression on its operand 1.  VOP is the array of
7295    operands and MASK is the loop mask.  A VEC_COND_EXPR that selects
7296    between VOP[1] and a replacement value is inserted before GSI, and
7297    VOP[1] is updated to its result.  */
7298 static void
7299 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7300                       gimple_stmt_iterator *gsi)
7301 {
7302   /* The value that replaces VOP[1] in the lanes for which MASK is
7303      false.  */
7304   tree inactive_val = NULL_TREE;
7305
7306   switch (tree_code (code))
7307     {
7308     case DOT_PROD_EXPR:
7309       /* Use zero, so that inactive lanes contribute a zero product.  */
7310       inactive_val = build_zero_cst (TREE_TYPE (vop[1]));
7311       break;
7312
7313     case SAD_EXPR:
7314       /* Use operand 0, so that inactive lanes have a zero difference.  */
7315       inactive_val = vop[0];
7316       break;
7317
7318     default:
7319       gcc_unreachable ();
7320     }
7321
7322   /* masked_op1 = MASK ? VOP[1] : INACTIVE_VAL.  */
7323   tree op1_type = TREE_TYPE (vop[1]);
7324   tree masked_op1 = make_temp_ssa_name (op1_type, NULL, "masked_op1");
7325   gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7326                                          mask, vop[1], inactive_val);
7327   gsi_insert_before (gsi, select, GSI_SAME_STMT);
7328   vop[1] = masked_op1;
7329 }
7330
7331 /* Function vectorizable_reduction.
7332
7333    Check if STMT_INFO performs a reduction operation that can be vectorized.
7334    The outcome of the analysis is recorded on the stmt info returned by
7335    info_for_reduction.  Return true if STMT_INFO is vectorizable in this
7336    way.
7337
7338    This function also handles reduction idioms (patterns) that have been
7339    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
7340    may be of this form:
7341      X = pattern_expr (arg0, arg1, ..., X)
7342    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7343    sequence that had been detected and replaced by the pattern-stmt
7344    (STMT_INFO).
7345
7346    This function also handles reduction of condition expressions, for example:
7347      for (int i = 0; i < N; i++)
7348        if (a[i] < value)
7349          last = a[i];
7350    This is handled by vectorising the loop and creating an additional vector
7351    containing the loop indexes for which "a[i] < value" was true.  In the
7352    function epilogue this is reduced to a single max value and then used to
7353    index into the vector of results.
7354
7355    In some cases of reduction patterns, the type of the reduction variable X is
7356    different than the type of the other arguments of STMT_INFO.
7357    In such cases, the vectype that is used when transforming STMT_INFO into
7358    a vector stmt is different than the vectype that is used to determine the
7359    vectorization factor, because it consists of a different number of elements
7360    than the actual number of elements that are being operated upon in parallel.
7361
7362    For example, consider an accumulation of shorts into an int accumulator.
7363    On some targets it's possible to vectorize this pattern operating on 8
7364    shorts at a time (hence, the vectype for purposes of determining the
7365    vectorization factor should be V8HI); on the other hand, the vectype that
7366    is used to create the vector form is actually V4SI (the type of the result).
7367
7368    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7369    indicates what is the actual level of parallelism (V8HI in the example), so
7370    that the right vectorization factor would be derived.  This vectype
7371    corresponds to the type of arguments to the reduction stmt, and should *NOT*
7372    be used to create the vectorized stmt.  The right vectype for the vectorized
7373    stmt is obtained from the type of the result X:
7374       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7375
7376    This means that, contrary to "regular" reductions (or "regular" stmts in
7377    general), the following equation:
7378       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7379    does *NOT* necessarily hold for reduction patterns.  */
7380
7381 bool
7382 vectorizable_reduction (loop_vec_info loop_vinfo,
7383                         stmt_vec_info stmt_info, slp_tree slp_node,
7384                         slp_instance slp_node_instance,
7385                         stmt_vector_for_cost *cost_vec)
7386 {
7387   tree vectype_in = NULL_TREE;
7388   tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7389   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7390   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7391   stmt_vec_info cond_stmt_vinfo = NULL;
7392   int i;
7393   int ncopies;
7394   bool single_defuse_cycle = false;
7395   bool nested_cycle = false;
7396   bool double_reduc = false;
7397   int vec_num;
7398   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7399   tree cond_reduc_val = NULL_TREE;
7400
7401   /* Make sure it was already recognized as a reduction computation.  */
7402   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7403       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7404       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7405     return false;
7406
7407   /* The stmt we store reduction analysis meta on.  */
7408   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7409   reduc_info->is_reduc_info = true;
7410
7411   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7412     {
7413       if (is_a <gphi *> (stmt_info->stmt))
7414         {
7415           if (slp_node)
7416             {
7417               /* We eventually need to set a vector type on invariant
7418                  arguments.  */
7419               unsigned j;
7420               slp_tree child;
7421               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7422                 if (!vect_maybe_update_slp_op_vectype
7423                        (child, SLP_TREE_VECTYPE (slp_node)))
7424                   {
7425                     if (dump_enabled_p ())
7426                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7427                                        "incompatible vector types for "
7428                                        "invariants\n");
7429                     return false;
7430                   }
7431             }
7432           /* Analysis for double-reduction is done on the outer
7433              loop PHI, nested cycles have no further restrictions.  */
7434           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7435         }
7436       else
7437         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7438       return true;
7439     }
7440
7441   stmt_vec_info orig_stmt_of_analysis = stmt_info;
7442   stmt_vec_info phi_info = stmt_info;
7443   if (!is_a <gphi *> (stmt_info->stmt))
7444     {
7445       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7446       return true;
7447     }
7448   if (slp_node)
7449     {
7450       slp_node_instance->reduc_phis = slp_node;
7451       /* ???  We're leaving slp_node to point to the PHIs, we only
7452          need it to get at the number of vector stmts which wasn't
7453          yet initialized for the instance root.  */
7454     }
7455   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7456     {
7457       use_operand_p use_p;
7458       gimple *use_stmt;
7459       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7460                                  &use_p, &use_stmt);
7461       gcc_assert (res);
7462       phi_info = loop_vinfo->lookup_stmt (use_stmt);
7463     }
7464
7465   /* PHIs should not participate in patterns.  */
7466   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7467   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7468
7469   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7470      and compute the reduction chain length.  Discover the real
7471      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
7472   tree reduc_def
7473     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7474                              loop_latch_edge
7475                                (gimple_bb (reduc_def_phi)->loop_father));
7476   unsigned reduc_chain_length = 0;
7477   bool only_slp_reduc_chain = true;
7478   stmt_info = NULL;
7479   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7480   while (reduc_def != PHI_RESULT (reduc_def_phi))
7481     {
7482       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7483       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7484       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7485         {
7486           if (dump_enabled_p ())
7487             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7488                              "reduction chain broken by patterns.\n");
7489           return false;
7490         }
7491       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7492         only_slp_reduc_chain = false;
7493       /* For epilogue generation live members of the chain need
7494          to point back to the PHI via their original stmt for
7495          info_for_reduction to work.  For SLP we need to look at
7496          all lanes here - even though we only will vectorize from
7497          the SLP node with live lane zero the other live lanes also
7498          need to be identified as part of a reduction to be able
7499          to skip code generation for them.  */
7500       if (slp_for_stmt_info)
7501         {
7502           for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7503             if (STMT_VINFO_LIVE_P (s))
7504               STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7505         }
7506       else if (STMT_VINFO_LIVE_P (vdef))
7507         STMT_VINFO_REDUC_DEF (def) = phi_info;
7508       gimple_match_op op;
7509       if (!gimple_extract_op (vdef->stmt, &op))
7510         {
7511           if (dump_enabled_p ())
7512             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7513                              "reduction chain includes unsupported"
7514                              " statement type.\n");
7515           return false;
7516         }
7517       if (CONVERT_EXPR_CODE_P (op.code))
7518         {
7519           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7520             {
7521               if (dump_enabled_p ())
7522                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7523                                  "conversion in the reduction chain.\n");
7524               return false;
7525             }
7526         }
7527       else if (!stmt_info)
7528         /* First non-conversion stmt.  */
7529         stmt_info = vdef;
7530       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7531       reduc_chain_length++;
7532       if (!stmt_info && slp_node)
7533         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7534     }
7535   /* PHIs should not participate in patterns.  */
7536   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7537
7538   if (nested_in_vect_loop_p (loop, stmt_info))
7539     {
7540       loop = loop->inner;
7541       nested_cycle = true;
7542     }
7543
7544   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7545      element.  */
7546   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7547     {
7548       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7549       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7550     }
7551   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7552     gcc_assert (slp_node
7553                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7554
7555   /* 1. Is vectorizable reduction?  */
7556   /* Not supportable if the reduction variable is used in the loop, unless
7557      it's a reduction chain.  */
7558   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7559       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7560     return false;
7561
7562   /* Reductions that are not used even in an enclosing outer-loop,
7563      are expected to be "live" (used out of the loop).  */
7564   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7565       && !STMT_VINFO_LIVE_P (stmt_info))
7566     return false;
7567
7568   /* 2. Has this been recognized as a reduction pattern?
7569
7570      Check if STMT represents a pattern that has been recognized
7571      in earlier analysis stages.  For stmts that represent a pattern,
7572      the STMT_VINFO_RELATED_STMT field records the last stmt in
7573      the original sequence that constitutes the pattern.  */
7574
7575   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7576   if (orig_stmt_info)
7577     {
7578       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7579       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7580     }
7581
7582   /* 3. Check the operands of the operation.  The first operands are defined
7583         inside the loop body. The last operand is the reduction variable,
7584         which is defined by the loop-header-phi.  */
7585
7586   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7587   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7588   gimple_match_op op;
7589   if (!gimple_extract_op (stmt_info->stmt, &op))
7590     gcc_unreachable ();
7591   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7592                             || op.code == WIDEN_SUM_EXPR
7593                             || op.code == SAD_EXPR);
7594
7595   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7596       && !SCALAR_FLOAT_TYPE_P (op.type))
7597     return false;
7598
7599   /* Do not try to vectorize bit-precision reductions.  */
7600   if (!type_has_mode_precision_p (op.type))
7601     return false;
7602
7603   /* For lane-reducing ops we're reducing the number of reduction PHIs
7604      which means the only use of that may be in the lane-reducing operation.  */
7605   if (lane_reduc_code_p
7606       && reduc_chain_length != 1
7607       && !only_slp_reduc_chain)
7608     {
7609       if (dump_enabled_p ())
7610         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7611                          "lane-reducing reduction with extra stmts.\n");
7612       return false;
7613     }
7614
7615   /* All uses but the last are expected to be defined in the loop.
7616      The last use is the reduction variable.  In case of nested cycle this
7617      assumption is not true: we use reduc_index to record the index of the
7618      reduction variable.  */
7619   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7620   /* We need to skip an extra operand for COND_EXPRs with embedded
7621      comparison.  */
7622   unsigned opno_adjust = 0;
7623   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7624     opno_adjust = 1;
7625   for (i = 0; i < (int) op.num_ops; i++)
7626     {
7627       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
7628       if (i == 0 && op.code == COND_EXPR)
7629         continue;
7630
7631       stmt_vec_info def_stmt_info;
7632       enum vect_def_type dt;
7633       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7634                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7635                                &vectype_op[i], &def_stmt_info))
7636         {
7637           if (dump_enabled_p ())
7638             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7639                              "use not simple.\n");
7640           return false;
7641         }
7642       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7643         continue;
7644
7645       /* For an IFN_COND_OP we might hit the reduction definition operand
7646          twice (once as definition, once as else).  */
7647       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7648         continue;
7649
7650       /* There should be only one cycle def in the stmt, the one
7651          leading to reduc_def.  */
7652       if (VECTORIZABLE_CYCLE_DEF (dt))
7653         return false;
7654
7655       if (!vectype_op[i])
7656         vectype_op[i]
7657           = get_vectype_for_scalar_type (loop_vinfo,
7658                                          TREE_TYPE (op.ops[i]), slp_op[i]);
7659
7660       /* To properly compute ncopies we are interested in the widest
7661          non-reduction input type in case we're looking at a widening
7662          accumulation that we later handle in vect_transform_reduction.  */
7663       if (lane_reduc_code_p
7664           && vectype_op[i]
7665           && (!vectype_in
7666               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7667                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7668         vectype_in = vectype_op[i];
7669
7670       if (op.code == COND_EXPR)
7671         {
7672           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
7673           if (dt == vect_constant_def)
7674             {
7675               cond_reduc_dt = dt;
7676               cond_reduc_val = op.ops[i];
7677             }
7678           if (dt == vect_induction_def
7679               && def_stmt_info
7680               && is_nonwrapping_integer_induction (def_stmt_info, loop))
7681             {
7682               cond_reduc_dt = dt;
7683               cond_stmt_vinfo = def_stmt_info;
7684             }
7685         }
7686     }
7687   if (!vectype_in)
7688     vectype_in = STMT_VINFO_VECTYPE (phi_info);
7689   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7690
7691   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7692   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7693   /* If we have a condition reduction, see if we can simplify it further.  */
7694   if (v_reduc_type == COND_REDUCTION)
7695     {
7696       if (slp_node)
7697         return false;
7698
7699       /* When the condition uses the reduction value in the condition, fail.  */
7700       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7701         {
7702           if (dump_enabled_p ())
7703             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7704                              "condition depends on previous iteration\n");
7705           return false;
7706         }
7707
7708       if (reduc_chain_length == 1
7709           && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7710                                               OPTIMIZE_FOR_SPEED)
7711               || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7712                                                  vectype_in,
7713                                                  OPTIMIZE_FOR_SPEED)))
7714         {
7715           if (dump_enabled_p ())
7716             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7717                              "optimizing condition reduction with"
7718                              " FOLD_EXTRACT_LAST.\n");
7719           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7720         }
7721       else if (cond_reduc_dt == vect_induction_def)
7722         {
7723           tree base
7724             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7725           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7726
7727           gcc_assert (TREE_CODE (base) == INTEGER_CST
7728                       && TREE_CODE (step) == INTEGER_CST);
7729           cond_reduc_val = NULL_TREE;
7730           enum tree_code cond_reduc_op_code = ERROR_MARK;
7731           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7732           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7733             ;
7734           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7735              above base; punt if base is the minimum value of the type for
7736              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
7737           else if (tree_int_cst_sgn (step) == -1)
7738             {
7739               cond_reduc_op_code = MIN_EXPR;
7740               if (tree_int_cst_sgn (base) == -1)
7741                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7742               else if (tree_int_cst_lt (base,
7743                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
7744                 cond_reduc_val
7745                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
7746             }
7747           else
7748             {
7749               cond_reduc_op_code = MAX_EXPR;
7750               if (tree_int_cst_sgn (base) == 1)
7751                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7752               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7753                                         base))
7754                 cond_reduc_val
7755                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
7756             }
7757           if (cond_reduc_val)
7758             {
7759               if (dump_enabled_p ())
7760                 dump_printf_loc (MSG_NOTE, vect_location,
7761                                  "condition expression based on "
7762                                  "integer induction.\n");
7763               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7764               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7765                 = cond_reduc_val;
7766               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7767             }
7768         }
7769       else if (cond_reduc_dt == vect_constant_def)
7770         {
7771           enum vect_def_type cond_initial_dt;
7772           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7773           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7774           if (cond_initial_dt == vect_constant_def
7775               && types_compatible_p (TREE_TYPE (cond_initial_val),
7776                                      TREE_TYPE (cond_reduc_val)))
7777             {
7778               tree e = fold_binary (LE_EXPR, boolean_type_node,
7779                                     cond_initial_val, cond_reduc_val);
7780               if (e && (integer_onep (e) || integer_zerop (e)))
7781                 {
7782                   if (dump_enabled_p ())
7783                     dump_printf_loc (MSG_NOTE, vect_location,
7784                                      "condition expression based on "
7785                                      "compile time constant.\n");
7786                   /* Record reduction code at analysis stage.  */
7787                   STMT_VINFO_REDUC_CODE (reduc_info)
7788                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7789                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7790                 }
7791             }
7792         }
7793     }
7794
7795   if (STMT_VINFO_LIVE_P (phi_info))
7796     return false;
7797
7798   if (slp_node)
7799     ncopies = 1;
7800   else
7801     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7802
7803   gcc_assert (ncopies >= 1);
7804
7805   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7806
7807   if (nested_cycle)
7808     {
7809       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7810                   == vect_double_reduction_def);
7811       double_reduc = true;
7812     }
7813
7814   /* 4.2. Check support for the epilog operation.
7815
7816           If STMT represents a reduction pattern, then the type of the
7817           reduction variable may be different than the type of the rest
7818           of the arguments.  For example, consider the case of accumulation
7819           of shorts into an int accumulator; The original code:
7820                         S1: int_a = (int) short_a;
7821           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7822
7823           was replaced with:
7824                         STMT: int_acc = widen_sum <short_a, int_acc>
7825
7826           This means that:
7827           1. The tree-code that is used to create the vector operation in the
7828              epilog code (that reduces the partial results) is not the
7829              tree-code of STMT, but is rather the tree-code of the original
7830              stmt from the pattern that STMT is replacing.  I.e, in the example
7831              above we want to use 'widen_sum' in the loop, but 'plus' in the
7832              epilog.
7833           2. The type (mode) we use to check available target support
7834              for the vector operation to be created in the *epilog*, is
7835              determined by the type of the reduction variable (in the example
7836              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7837              However the type (mode) we use to check available target support
7838              for the vector operation to be created *inside the loop*, is
7839              determined by the type of the other arguments to STMT (in the
7840              example we'd check this: optab_handler (widen_sum_optab,
7841              vect_short_mode)).
7842
7843           This is contrary to "regular" reductions, in which the types of all
7844           the arguments are the same as the type of the reduction variable.
7845           For "regular" reductions we can therefore use the same vector type
7846           (and also the same tree-code) when generating the epilog code and
7847           when generating the code inside the loop.  */
7848
7849   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7850
7851   /* If-conversion might already have created a conditional operation like
7852      IFN_COND_ADD.  Use the underlying tree code for the following checks.  */
7853   if (orig_code.is_internal_fn ())
7854     {
7855       tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7856       orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7857     }
7858
7859   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7860
7861   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7862   if (reduction_type == TREE_CODE_REDUCTION)
7863     {
7864       /* Check whether it's ok to change the order of the computation.
7865          Generally, when vectorizing a reduction we change the order of the
7866          computation.  This may change the behavior of the program in some
7867          cases, so we need to check that this is ok.  One exception is when
7868          vectorizing an outer-loop: the inner-loop is executed sequentially,
7869          and therefore vectorizing reductions in the inner-loop during
7870          outer-loop vectorization is safe.  Likewise when we are vectorizing
7871          a series of reductions using SLP and the VF is one the reductions
7872          are performed in scalar order.  */
7873       if (slp_node
7874           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7875           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7876         ;
7877       else if (needs_fold_left_reduction_p (op.type, orig_code))
7878         {
7879           /* When vectorizing a reduction chain w/o SLP the reduction PHI
7880              is not directly used in stmt.  */
7881           if (!only_slp_reduc_chain
7882               && reduc_chain_length != 1)
7883             {
7884               if (dump_enabled_p ())
7885                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7886                                  "in-order reduction chain without SLP.\n");
7887               return false;
7888             }
7889           STMT_VINFO_REDUC_TYPE (reduc_info)
7890             = reduction_type = FOLD_LEFT_REDUCTION;
7891         }
7892       else if (!commutative_binary_op_p (orig_code, op.type)
7893                || !associative_binary_op_p (orig_code, op.type))
7894         {
7895           if (dump_enabled_p ())
7896             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7897                             "reduction: not commutative/associative\n");
7898           return false;
7899         }
7900     }
7901
7902   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7903       && ncopies > 1)
7904     {
7905       if (dump_enabled_p ())
7906         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7907                          "multiple types in double reduction or condition "
7908                          "reduction or fold-left reduction.\n");
7909       return false;
7910     }
7911
7912   internal_fn reduc_fn = IFN_LAST;
7913   if (reduction_type == TREE_CODE_REDUCTION
7914       || reduction_type == FOLD_LEFT_REDUCTION
7915       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7916       || reduction_type == CONST_COND_REDUCTION)
7917     {
7918       if (reduction_type == FOLD_LEFT_REDUCTION
7919           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7920           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7921         {
7922           if (reduc_fn != IFN_LAST
7923               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7924                                                   OPTIMIZE_FOR_SPEED))
7925             {
7926               if (dump_enabled_p ())
7927                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7928                                  "reduc op not supported by target.\n");
7929
7930               reduc_fn = IFN_LAST;
7931             }
7932         }
7933       else
7934         {
7935           if (!nested_cycle || double_reduc)
7936             {
7937               if (dump_enabled_p ())
7938                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7939                                  "no reduc code for scalar code.\n");
7940
7941               return false;
7942             }
7943         }
7944     }
7945   else if (reduction_type == COND_REDUCTION)
7946     {
7947       int scalar_precision
7948         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7949       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7950       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7951                                                 vectype_out);
7952
7953       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7954                                           OPTIMIZE_FOR_SPEED))
7955         reduc_fn = IFN_REDUC_MAX;
7956     }
7957   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7958
7959   if (reduction_type != EXTRACT_LAST_REDUCTION
7960       && (!nested_cycle || double_reduc)
7961       && reduc_fn == IFN_LAST
7962       && !nunits_out.is_constant ())
7963     {
7964       if (dump_enabled_p ())
7965         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7966                          "missing target support for reduction on"
7967                          " variable-length vectors.\n");
7968       return false;
7969     }
7970
7971   /* For SLP reductions, see if there is a neutral value we can use.  */
7972   tree neutral_op = NULL_TREE;
7973   if (slp_node)
7974     {
7975       tree initial_value = NULL_TREE;
7976       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7977         initial_value = vect_phi_initial_value (reduc_def_phi);
7978       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7979                                              orig_code, initial_value);
7980     }
7981
7982   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7983     {
7984       /* We can't support in-order reductions of code such as this:
7985
7986            for (int i = 0; i < n1; ++i)
7987              for (int j = 0; j < n2; ++j)
7988                l += a[j];
7989
7990          since GCC effectively transforms the loop when vectorizing:
7991
7992            for (int i = 0; i < n1 / VF; ++i)
7993              for (int j = 0; j < n2; ++j)
7994                for (int k = 0; k < VF; ++k)
7995                  l += a[j];
7996
7997          which is a reassociation of the original operation.  */
7998       if (dump_enabled_p ())
7999         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8000                          "in-order double reduction not supported.\n");
8001
8002       return false;
8003     }
8004
8005   if (reduction_type == FOLD_LEFT_REDUCTION
8006       && slp_node
8007       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8008     {
8009       /* We cannot use in-order reductions in this case because there is
8010          an implicit reassociation of the operations involved.  */
8011       if (dump_enabled_p ())
8012         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8013                          "in-order unchained SLP reductions not supported.\n");
8014       return false;
8015     }
8016
8017   /* For double reductions, and for SLP reductions with a neutral value,
8018      we construct a variable-length initial vector by loading a vector
8019      full of the neutral value and then shift-and-inserting the start
8020      values into the low-numbered elements.  */
8021   if ((double_reduc || neutral_op)
8022       && !nunits_out.is_constant ()
8023       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8024                                           vectype_out, OPTIMIZE_FOR_SPEED))
8025     {
8026       if (dump_enabled_p ())
8027         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8028                          "reduction on variable-length vectors requires"
8029                          " target support for a vector-shift-and-insert"
8030                          " operation.\n");
8031       return false;
8032     }
8033
8034   /* Check extra constraints for variable-length unchained SLP reductions.  */
8035   if (slp_node
8036       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8037       && !nunits_out.is_constant ())
8038     {
8039       /* We checked above that we could build the initial vector when
8040          there's a neutral element value.  Check here for the case in
8041          which each SLP statement has its own initial value and in which
8042          that value needs to be repeated for every instance of the
8043          statement within the initial vector.  */
8044       unsigned int group_size = SLP_TREE_LANES (slp_node);
8045       if (!neutral_op
8046           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8047                                               TREE_TYPE (vectype_out)))
8048         {
8049           if (dump_enabled_p ())
8050             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8051                              "unsupported form of SLP reduction for"
8052                              " variable-length vectors: cannot build"
8053                              " initial vector.\n");
8054           return false;
8055         }
8056       /* The epilogue code relies on the number of elements being a multiple
8057          of the group size.  The duplicate-and-interleave approach to setting
8058          up the initial vector does too.  */
8059       if (!multiple_p (nunits_out, group_size))
8060         {
8061           if (dump_enabled_p ())
8062             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8063                              "unsupported form of SLP reduction for"
8064                              " variable-length vectors: the vector size"
8065                              " is not a multiple of the number of results.\n");
8066           return false;
8067         }
8068     }
8069
8070   if (reduction_type == COND_REDUCTION)
8071     {
8072       widest_int ni;
8073
8074       if (! max_loop_iterations (loop, &ni))
8075         {
8076           if (dump_enabled_p ())
8077             dump_printf_loc (MSG_NOTE, vect_location,
8078                              "loop count not known, cannot create cond "
8079                              "reduction.\n");
8080           return false;
8081         }
8082       /* Convert backedges to iterations.  */
8083       ni += 1;
8084
8085       /* The additional index will be the same type as the condition.  Check
8086          that the loop can fit into this less one (because we'll use up the
8087          zero slot for when there are no matches).  */
8088       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8089       if (wi::geu_p (ni, wi::to_widest (max_index)))
8090         {
8091           if (dump_enabled_p ())
8092             dump_printf_loc (MSG_NOTE, vect_location,
8093                              "loop size is greater than data size.\n");
8094           return false;
8095         }
8096     }
8097
8098   /* In case the vectorization factor (VF) is bigger than the number
8099      of elements that we can fit in a vectype (nunits), we have to generate
8100      more than one vector stmt - i.e - we need to "unroll" the
8101      vector stmt by a factor VF/nunits.  For more details see documentation
8102      in vectorizable_operation.  */
8103
8104   /* If the reduction is used in an outer loop we need to generate
8105      VF intermediate results, like so (e.g. for ncopies=2):
8106         r0 = phi (init, r0)
8107         r1 = phi (init, r1)
8108         r0 = x0 + r0;
8109         r1 = x1 + r1;
8110     (i.e. we generate VF results in 2 registers).
8111     In this case we have a separate def-use cycle for each copy, and therefore
8112     for each copy we get the vector def for the reduction variable from the
8113     respective phi node created for this copy.
8114
8115     Otherwise (the reduction is unused in the loop nest), we can combine
8116     together intermediate results, like so (e.g. for ncopies=2):
8117         r = phi (init, r)
8118         r = x0 + r;
8119         r = x1 + r;
8120    (i.e. we generate VF/2 results in a single register).
8121    In this case for each copy we get the vector def for the reduction variable
8122    from the vectorized reduction operation generated in the previous iteration.
8123
8124    This only works when we see both the reduction PHI and its only consumer
8125    in vectorizable_reduction and there are no intermediate stmts
8126    participating.  When unrolling we want each unrolled iteration to have its
8127    own reduction accumulator since one of the main goals of unrolling a
8128    reduction is to reduce the aggregate loop-carried latency.  */
8129   if (ncopies > 1
8130       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8131       && reduc_chain_length == 1
8132       && loop_vinfo->suggested_unroll_factor == 1)
8133     single_defuse_cycle = true;
8134
8135   if (single_defuse_cycle || lane_reduc_code_p)
8136     {
8137       gcc_assert (op.code != COND_EXPR);
8138
8139       /* 4. Supportable by target?  */
8140       bool ok = true;
8141
8142       /* 4.1. check support for the operation in the loop
8143
8144          This isn't necessary for the lane reduction codes, since they
8145          can only be produced by pattern matching, and it's up to the
8146          pattern matcher to test for support.  The main reason for
8147          specifically skipping this step is to avoid rechecking whether
8148          mixed-sign dot-products can be implemented using signed
8149          dot-products.  */
8150       machine_mode vec_mode = TYPE_MODE (vectype_in);
8151       if (!lane_reduc_code_p
8152           && !directly_supported_p (op.code, vectype_in, optab_vector))
8153         {
8154           if (dump_enabled_p ())
8155             dump_printf (MSG_NOTE, "op not supported by target.\n");
8156           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8157               || !vect_can_vectorize_without_simd_p (op.code))
8158             ok = false;
8159           else
8160             if (dump_enabled_p ())
8161               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8162         }
8163
8164       if (vect_emulated_vector_p (vectype_in)
8165           && !vect_can_vectorize_without_simd_p (op.code))
8166         {
8167           if (dump_enabled_p ())
8168             dump_printf (MSG_NOTE, "using word mode not possible.\n");
8169           return false;
8170         }
8171
8172       /* lane-reducing operations have to go through vect_transform_reduction.
8173          For the other cases try without the single cycle optimization.  */
8174       if (!ok)
8175         {
8176           if (lane_reduc_code_p)
8177             return false;
8178           else
8179             single_defuse_cycle = false;
8180         }
8181     }
8182   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8183
8184   /* If the reduction stmt is one of the patterns that have lane
8185      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
8186   if ((ncopies > 1 && ! single_defuse_cycle)
8187       && lane_reduc_code_p)
8188     {
8189       if (dump_enabled_p ())
8190         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8191                          "multi def-use cycle not possible for lane-reducing "
8192                          "reduction operation\n");
8193       return false;
8194     }
8195
8196   if (slp_node
8197       && !(!single_defuse_cycle
8198            && !lane_reduc_code_p
8199            && reduction_type != FOLD_LEFT_REDUCTION))
8200     for (i = 0; i < (int) op.num_ops; i++)
8201       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8202         {
8203           if (dump_enabled_p ())
8204             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8205                              "incompatible vector types for invariants\n");
8206           return false;
8207         }
8208
8209   if (slp_node)
8210     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8211   else
8212     vec_num = 1;
8213
8214   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8215                              reduction_type, ncopies, cost_vec);
8216   /* Cost the reduction op inside the loop if transformed via
8217      vect_transform_reduction.  Otherwise this is costed by the
8218      separate vectorizable_* routines.  */
8219   if (single_defuse_cycle || lane_reduc_code_p)
8220     {
8221       int factor = 1;
8222       if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8223         /* Three dot-products and a subtraction.  */
8224         factor = 4;
8225       record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8226                         stmt_info, 0, vect_body);
8227     }
8228
8229   if (dump_enabled_p ()
8230       && reduction_type == FOLD_LEFT_REDUCTION)
8231     dump_printf_loc (MSG_NOTE, vect_location,
8232                      "using an in-order (fold-left) reduction.\n");
8233   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8234   /* All but single defuse-cycle optimized, lane-reducing and fold-left
8235      reductions go through their own vectorizable_* routines.  */
8236   if (!single_defuse_cycle
8237       && !lane_reduc_code_p
8238       && reduction_type != FOLD_LEFT_REDUCTION)
8239     {
8240       stmt_vec_info tem
8241         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8242       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8243         {
8244           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8245           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8246         }
8247       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8248       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8249     }
8250   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8251     {
8252       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8253       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8254       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8255
8256       if (reduction_type != FOLD_LEFT_REDUCTION
8257           && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8258           && (cond_fn == IFN_LAST
8259               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8260                                                   OPTIMIZE_FOR_SPEED)))
8261         {
8262           if (dump_enabled_p ())
8263             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8264                              "can't operate on partial vectors because"
8265                              " no conditional operation is available.\n");
8266           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8267         }
8268       else if (reduction_type == FOLD_LEFT_REDUCTION
8269                && reduc_fn == IFN_LAST
8270                && !expand_vec_cond_expr_p (vectype_in,
8271                                            truth_type_for (vectype_in),
8272                                            SSA_NAME))
8273         {
8274           if (dump_enabled_p ())
8275             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8276                              "can't operate on partial vectors because"
8277                              " no conditional operation is available.\n");
8278           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8279         }
8280       else if (reduction_type == FOLD_LEFT_REDUCTION
8281                && internal_fn_mask_index (reduc_fn) == -1
8282                && FLOAT_TYPE_P (vectype_in)
8283                && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8284         {
8285           if (dump_enabled_p ())
8286             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8287                              "can't operate on partial vectors because"
8288                              " signed zeros cannot be preserved.\n");
8289           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8290         }
8291       else
8292         {
8293           internal_fn mask_reduc_fn
8294             = get_masked_reduction_fn (reduc_fn, vectype_in);
8295
8296           if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8297             vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8298                                   vectype_in, 1);
8299           else
8300             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8301                                    vectype_in, NULL);
8302         }
8303     }
8304   return true;
8305 }
8306
8307 /* STMT_INFO is a dot-product reduction whose multiplication operands
8308    have different signs.  Emit a sequence to emulate the operation
8309    using a series of signed DOT_PROD_EXPRs and return the last
8310    statement generated.  VEC_DEST is the result of the vector operation
8311    and VOP lists its inputs.  */
8312
8313 static gassign *
8314 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8315                              gimple_stmt_iterator *gsi, tree vec_dest,
8316                              tree vop[3])
8317 {
8318   tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8319   tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8320   tree narrow_elttype = TREE_TYPE (narrow_vectype);
8321   gimple *new_stmt;
8322
8323   /* Make VOP[0] the unsigned operand VOP[1] the signed operand.  */
8324   if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8325     std::swap (vop[0], vop[1]);
8326
8327   /* Convert all inputs to signed types.  */
8328   for (int i = 0; i < 3; ++i)
8329     if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8330       {
8331         tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8332         new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8333         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8334         vop[i] = tmp;
8335       }
8336
8337   /* In the comments below we assume 8-bit inputs for simplicity,
8338      but the approach works for any full integer type.  */
8339
8340   /* Create a vector of -128.  */
8341   tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8342   tree min_narrow = build_vector_from_val (narrow_vectype,
8343                                            min_narrow_elttype);
8344
8345   /* Create a vector of 64.  */
8346   auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8347   tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8348   half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8349
8350   /* Emit: SUB_RES = VOP[0] - 128.  */
8351   tree sub_res = make_ssa_name (narrow_vectype);
8352   new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8353   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8354
8355   /* Emit:
8356
8357        STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8358        STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8359        STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8360
8361      on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8362      Doing the two 64 * y steps first allows more time to compute x.  */
8363   tree stage1 = make_ssa_name (wide_vectype);
8364   new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8365                                   vop[1], half_narrow, vop[2]);
8366   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8367
8368   tree stage2 = make_ssa_name (wide_vectype);
8369   new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8370                                   vop[1], half_narrow, stage1);
8371   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8372
8373   tree stage3 = make_ssa_name (wide_vectype);
8374   new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8375                                   sub_res, vop[1], stage2);
8376   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8377
8378   /* Convert STAGE3 to the reduction type.  */
8379   return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8380 }
8381
8382 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8383    value.  */
8384
8385 bool
8386 vect_transform_reduction (loop_vec_info loop_vinfo,
8387                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8388                           gimple **vec_stmt, slp_tree slp_node)
8389 {
8390   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8391   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8392   int i;
8393   int ncopies;
8394   int vec_num;
8395
8396   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8397   gcc_assert (reduc_info->is_reduc_info);
8398
8399   if (nested_in_vect_loop_p (loop, stmt_info))
8400     {
8401       loop = loop->inner;
8402       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8403     }
8404
8405   gimple_match_op op;
8406   if (!gimple_extract_op (stmt_info->stmt, &op))
8407     gcc_unreachable ();
8408
8409   /* All uses but the last are expected to be defined in the loop.
8410      The last use is the reduction variable.  In case of nested cycle this
8411      assumption is not true: we use reduc_index to record the index of the
8412      reduction variable.  */
8413   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8414   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8415   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8416   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8417
8418   if (slp_node)
8419     {
8420       ncopies = 1;
8421       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8422     }
8423   else
8424     {
8425       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8426       vec_num = 1;
8427     }
8428
8429   code_helper code = canonicalize_code (op.code, op.type);
8430   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8431
8432   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8433   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8434   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8435
8436   /* Transform.  */
8437   tree new_temp = NULL_TREE;
8438   auto_vec<tree> vec_oprnds0;
8439   auto_vec<tree> vec_oprnds1;
8440   auto_vec<tree> vec_oprnds2;
8441   tree def0;
8442
8443   if (dump_enabled_p ())
8444     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8445
8446   /* FORNOW: Multiple types are not supported for condition.  */
8447   if (code == COND_EXPR)
8448     gcc_assert (ncopies == 1);
8449
8450   /* A binary COND_OP reduction must have the same definition and else
8451      value. */
8452   bool cond_fn_p = code.is_internal_fn ()
8453     && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8454   if (cond_fn_p)
8455     {
8456       gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8457                   || code == IFN_COND_MUL || code == IFN_COND_AND
8458                   || code == IFN_COND_IOR || code == IFN_COND_XOR);
8459       gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
8460     }
8461
8462   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8463
8464   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8465   if (reduction_type == FOLD_LEFT_REDUCTION)
8466     {
8467       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8468       gcc_assert (code.is_tree_code () || cond_fn_p);
8469       return vectorize_fold_left_reduction
8470           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8471            code, reduc_fn, op.ops, op.num_ops, vectype_in,
8472            reduc_index, masks, lens);
8473     }
8474
8475   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8476   gcc_assert (single_defuse_cycle
8477               || code == DOT_PROD_EXPR
8478               || code == WIDEN_SUM_EXPR
8479               || code == SAD_EXPR);
8480
8481   /* Create the destination vector  */
8482   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8483   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8484
8485   /* Get NCOPIES vector definitions for all operands except the reduction
8486      definition.  */
8487   if (!cond_fn_p)
8488     {
8489       vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8490                          single_defuse_cycle && reduc_index == 0
8491                          ? NULL_TREE : op.ops[0], &vec_oprnds0,
8492                          single_defuse_cycle && reduc_index == 1
8493                          ? NULL_TREE : op.ops[1], &vec_oprnds1,
8494                          op.num_ops == 3
8495                          && !(single_defuse_cycle && reduc_index == 2)
8496                          ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8497     }
8498   else
8499     {
8500       /* For a conditional operation pass the truth type as mask
8501          vectype.  */
8502       gcc_assert (single_defuse_cycle && reduc_index == 1);
8503       vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8504                          op.ops[0], &vec_oprnds0,
8505                          truth_type_for (vectype_in),
8506                          NULL_TREE, &vec_oprnds1, NULL_TREE,
8507                          op.ops[2], &vec_oprnds2, NULL_TREE);
8508     }
8509
8510   /* For single def-use cycles get one copy of the vectorized reduction
8511      definition.  */
8512   if (single_defuse_cycle)
8513     {
8514       gcc_assert (!slp_node);
8515       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8516                                      op.ops[reduc_index],
8517                                      reduc_index == 0 ? &vec_oprnds0
8518                                      : (reduc_index == 1 ? &vec_oprnds1
8519                                         : &vec_oprnds2));
8520     }
8521
8522   bool emulated_mixed_dot_prod
8523     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8524   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8525     {
8526       gimple *new_stmt;
8527       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8528       if (masked_loop_p && !mask_by_cond_expr)
8529         {
8530           /* No conditional ifns have been defined for dot-product yet.  */
8531           gcc_assert (code != DOT_PROD_EXPR);
8532
8533           /* Make sure that the reduction accumulator is vop[0].  */
8534           if (reduc_index == 1)
8535             {
8536               gcc_assert (commutative_binary_op_p (code, op.type));
8537               std::swap (vop[0], vop[1]);
8538             }
8539           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8540                                           vec_num * ncopies, vectype_in, i);
8541           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8542                                                     vop[0], vop[1], vop[0]);
8543           new_temp = make_ssa_name (vec_dest, call);
8544           gimple_call_set_lhs (call, new_temp);
8545           gimple_call_set_nothrow (call, true);
8546           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8547           new_stmt = call;
8548         }
8549       else
8550         {
8551           if (op.num_ops >= 3)
8552             vop[2] = vec_oprnds2[i];
8553
8554           if (masked_loop_p && mask_by_cond_expr)
8555             {
8556               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8557                                               vec_num * ncopies, vectype_in, i);
8558               build_vect_cond_expr (code, vop, mask, gsi);
8559             }
8560
8561           if (emulated_mixed_dot_prod)
8562             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8563                                                     vec_dest, vop);
8564
8565           else if (code.is_internal_fn () && !cond_fn_p)
8566             new_stmt = gimple_build_call_internal (internal_fn (code),
8567                                                    op.num_ops,
8568                                                    vop[0], vop[1], vop[2]);
8569           else if (code.is_internal_fn () && cond_fn_p)
8570             new_stmt = gimple_build_call_internal (internal_fn (code),
8571                                                    op.num_ops,
8572                                                    vop[0], vop[1], vop[2],
8573                                                    vop[1]);
8574           else
8575             new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8576                                             vop[0], vop[1], vop[2]);
8577           new_temp = make_ssa_name (vec_dest, new_stmt);
8578           gimple_set_lhs (new_stmt, new_temp);
8579           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8580         }
8581
8582       if (slp_node)
8583         slp_node->push_vec_def (new_stmt);
8584       else if (single_defuse_cycle
8585                && i < ncopies - 1)
8586         {
8587           if (reduc_index == 0)
8588             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8589           else if (reduc_index == 1)
8590             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8591           else if (reduc_index == 2)
8592             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8593         }
8594       else
8595         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8596     }
8597
8598   if (!slp_node)
8599     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8600
8601   return true;
8602 }
8603
8604 /* Transform phase of a cycle PHI.  */
8605
8606 bool
8607 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8608                           stmt_vec_info stmt_info, gimple **vec_stmt,
8609                           slp_tree slp_node, slp_instance slp_node_instance)
8610 {
8611   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8612   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8613   int i;
8614   int ncopies;
8615   int j;
8616   bool nested_cycle = false;
8617   int vec_num;
8618
8619   if (nested_in_vect_loop_p (loop, stmt_info))
8620     {
8621       loop = loop->inner;
8622       nested_cycle = true;
8623     }
8624
8625   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8626   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8627   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8628   gcc_assert (reduc_info->is_reduc_info);
8629
8630   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8631       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8632     /* Leave the scalar phi in place.  */
8633     return true;
8634
8635   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8636   /* For a nested cycle we do not fill the above.  */
8637   if (!vectype_in)
8638     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8639   gcc_assert (vectype_in);
8640
8641   if (slp_node)
8642     {
8643       /* The size vect_schedule_slp_instance computes is off for us.  */
8644       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8645                                       * SLP_TREE_LANES (slp_node), vectype_in);
8646       ncopies = 1;
8647     }
8648   else
8649     {
8650       vec_num = 1;
8651       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8652     }
8653
8654   /* Check whether we should use a single PHI node and accumulate
8655      vectors to one before the backedge.  */
8656   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8657     ncopies = 1;
8658
8659   /* Create the destination vector  */
8660   gphi *phi = as_a <gphi *> (stmt_info->stmt);
8661   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8662                                                vectype_out);
8663
8664   /* Get the loop-entry arguments.  */
8665   tree vec_initial_def = NULL_TREE;
8666   auto_vec<tree> vec_initial_defs;
8667   if (slp_node)
8668     {
8669       vec_initial_defs.reserve (vec_num);
8670       if (nested_cycle)
8671         {
8672           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8673           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8674                              &vec_initial_defs);
8675         }
8676       else
8677         {
8678           gcc_assert (slp_node == slp_node_instance->reduc_phis);
8679           vec<tree> &initial_values = reduc_info->reduc_initial_values;
8680           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8681
8682           unsigned int num_phis = stmts.length ();
8683           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8684             num_phis = 1;
8685           initial_values.reserve (num_phis);
8686           for (unsigned int i = 0; i < num_phis; ++i)
8687             {
8688               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8689               initial_values.quick_push (vect_phi_initial_value (this_phi));
8690             }
8691           if (vec_num == 1)
8692             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8693           if (!initial_values.is_empty ())
8694             {
8695               tree initial_value
8696                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8697               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8698               tree neutral_op
8699                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8700                                             code, initial_value);
8701               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8702                                               &vec_initial_defs, vec_num,
8703                                               stmts.length (), neutral_op);
8704             }
8705         }
8706     }
8707   else
8708     {
8709       /* Get at the scalar def before the loop, that defines the initial
8710          value of the reduction variable.  */
8711       tree initial_def = vect_phi_initial_value (phi);
8712       reduc_info->reduc_initial_values.safe_push (initial_def);
8713       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8714          and we can't use zero for induc_val, use initial_def.  Similarly
8715          for REDUC_MIN and initial_def larger than the base.  */
8716       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8717         {
8718           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8719           if (TREE_CODE (initial_def) == INTEGER_CST
8720               && !integer_zerop (induc_val)
8721               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8722                    && tree_int_cst_lt (initial_def, induc_val))
8723                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8724                       && tree_int_cst_lt (induc_val, initial_def))))
8725             {
8726               induc_val = initial_def;
8727               /* Communicate we used the initial_def to epilogue
8728                  generation.  */
8729               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8730             }
8731           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8732         }
8733       else if (nested_cycle)
8734         {
8735           /* Do not use an adjustment def as that case is not supported
8736              correctly if ncopies is not one.  */
8737           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8738                                          ncopies, initial_def,
8739                                          &vec_initial_defs);
8740         }
8741       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8742                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8743         /* Fill the initial vector with the initial scalar value.  */
8744         vec_initial_def
8745           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8746                                            initial_def, initial_def);
8747       else
8748         {
8749           if (ncopies == 1)
8750             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8751           if (!reduc_info->reduc_initial_values.is_empty ())
8752             {
8753               initial_def = reduc_info->reduc_initial_values[0];
8754               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8755               tree neutral_op
8756                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8757                                             code, initial_def);
8758               gcc_assert (neutral_op);
8759               /* Try to simplify the vector initialization by applying an
8760                  adjustment after the reduction has been performed.  */
8761               if (!reduc_info->reused_accumulator
8762                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8763                   && !operand_equal_p (neutral_op, initial_def))
8764                 {
8765                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8766                     = initial_def;
8767                   initial_def = neutral_op;
8768                 }
8769               vec_initial_def
8770                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8771                                                  initial_def, neutral_op);
8772             }
8773         }
8774     }
8775
8776   if (vec_initial_def)
8777     {
8778       vec_initial_defs.create (ncopies);
8779       for (i = 0; i < ncopies; ++i)
8780         vec_initial_defs.quick_push (vec_initial_def);
8781     }
8782
8783   if (auto *accumulator = reduc_info->reused_accumulator)
8784     {
8785       tree def = accumulator->reduc_input;
8786       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8787         {
8788           unsigned int nreduc;
8789           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8790                                             (TREE_TYPE (def)),
8791                                           TYPE_VECTOR_SUBPARTS (vectype_out),
8792                                           &nreduc);
8793           gcc_assert (res);
8794           gimple_seq stmts = NULL;
8795           /* Reduce the single vector to a smaller one.  */
8796           if (nreduc != 1)
8797             {
8798               /* Perform the reduction in the appropriate type.  */
8799               tree rvectype = vectype_out;
8800               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8801                                               TREE_TYPE (TREE_TYPE (def))))
8802                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8803                                               TYPE_VECTOR_SUBPARTS
8804                                                 (vectype_out));
8805               def = vect_create_partial_epilog (def, rvectype,
8806                                                 STMT_VINFO_REDUC_CODE
8807                                                   (reduc_info),
8808                                                 &stmts);
8809             }
8810           /* The epilogue loop might use a different vector mode, like
8811              VNx2DI vs. V2DI.  */
8812           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8813             {
8814               tree reduc_type = build_vector_type_for_mode
8815                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8816               def = gimple_convert (&stmts, reduc_type, def);
8817             }
8818           /* Adjust the input so we pick up the partially reduced value
8819              for the skip edge in vect_create_epilog_for_reduction.  */
8820           accumulator->reduc_input = def;
8821           /* And the reduction could be carried out using a different sign.  */
8822           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8823             def = gimple_convert (&stmts, vectype_out, def);
8824           if (loop_vinfo->main_loop_edge)
8825             {
8826               /* While we'd like to insert on the edge this will split
8827                  blocks and disturb bookkeeping, we also will eventually
8828                  need this on the skip edge.  Rely on sinking to
8829                  fixup optimal placement and insert in the pred.  */
8830               gimple_stmt_iterator gsi
8831                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8832               /* Insert before a cond that eventually skips the
8833                  epilogue.  */
8834               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8835                 gsi_prev (&gsi);
8836               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8837             }
8838           else
8839             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8840                                               stmts);
8841         }
8842       if (loop_vinfo->main_loop_edge)
8843         vec_initial_defs[0]
8844           = vect_get_main_loop_result (loop_vinfo, def,
8845                                        vec_initial_defs[0]);
8846       else
8847         vec_initial_defs.safe_push (def);
8848     }
8849
8850   /* Generate the reduction PHIs upfront.  */
8851   for (i = 0; i < vec_num; i++)
8852     {
8853       tree vec_init_def = vec_initial_defs[i];
8854       for (j = 0; j < ncopies; j++)
8855         {
8856           /* Create the reduction-phi that defines the reduction
8857              operand.  */
8858           gphi *new_phi = create_phi_node (vec_dest, loop->header);
8859
8860           /* Set the loop-entry arg of the reduction-phi.  */
8861           if (j != 0 && nested_cycle)
8862             vec_init_def = vec_initial_defs[j];
8863           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8864                        UNKNOWN_LOCATION);
8865
8866           /* The loop-latch arg is set in epilogue processing.  */
8867
8868           if (slp_node)
8869             slp_node->push_vec_def (new_phi);
8870           else
8871             {
8872               if (j == 0)
8873                 *vec_stmt = new_phi;
8874               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8875             }
8876         }
8877     }
8878
8879   return true;
8880 }
8881
8882 /* Vectorizes LC PHIs.
     STMT_INFO qualifies when its statement is a PHI with exactly one
     argument and its def type is vect_internal_def or
     vect_double_reduction_def; otherwise, or when LOOP_VINFO is null,
     return false.
     If VEC_STMT is null only the analysis is performed: with SLP_NODE the
     first child has to accept the vector type of SLP_NODE, and on success
     STMT_INFO is tagged with lc_phi_info_type.
     Otherwise create one single-argument vector PHI in the block of the
     scalar PHI for every vector def obtained for the PHI argument.  The
     new PHIs are pushed to SLP_NODE when it is given, else to
     STMT_VINFO_VEC_STMTS (STMT_INFO) with the first of them also stored
     in *VEC_STMT.  Return true on success.  */
8883
8884 bool
8885 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8886                      stmt_vec_info stmt_info, gimple **vec_stmt,
8887                      slp_tree slp_node)
8888 {
8889   if (!loop_vinfo
8890       || !is_a <gphi *> (stmt_info->stmt)
8891       || gimple_phi_num_args (stmt_info->stmt) != 1)
8892     return false;
8893
8894   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8895       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8896     return false;
8897
8898   if (!vec_stmt) /* transformation not required.  */
8899     {
8900       /* Deal with copies from externs or constants that disguise as
8901          loop-closed PHI nodes (PR97886).  */
8902       if (slp_node
8903           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8904                                                 SLP_TREE_VECTYPE (slp_node)))
8905         {
8906           if (dump_enabled_p ())
8907             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8908                              "incompatible vector types for invariants\n");
8909           return false;
8910         }
8911       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8912       return true;
8913     }
8914
       /* Transform.  Fetch the vector defs of the single PHI argument; the
          copy count passed is the number of vector copies for VECTYPE
          without SLP and 1 with SLP.  */
8915   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8916   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8917   basic_block bb = gimple_bb (stmt_info->stmt);
8918   edge e = single_pred_edge (bb);
8919   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8920   auto_vec<tree> vec_oprnds;
8921   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8922                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8923                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8924   for (unsigned i = 0; i < vec_oprnds.length (); i++)
8925     {
8926       /* Create the vectorized LC PHI node.  */
8927       gphi *new_phi = create_phi_node (vec_dest, bb);
8928       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8929       if (slp_node)
8930         slp_node->push_vec_def (new_phi);
8931       else
8932         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8933     }
8934   if (!slp_node)
8935     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8936
8937   return true;
8938 }
8939
8940 /* Vectorizes PHIs.  Only the SLP case is handled: return false unless
     STMT_INFO is a PHI with def type vect_internal_def and SLP_NODE is
     given.
     If VEC_STMT is null this is the analysis phase: every child of
     SLP_NODE must exist, accept the vector type of SLP_NODE and, when it
     is an internal def, have a vector type compatible with it.  A cost of
     SLP_TREE_NUMBER_OF_VEC_STMTS vector statements is recorded in COST_VEC
     for PHIs with more than one argument, and STMT_INFO is tagged with
     phi_info_type.
     Otherwise create the vector PHIs in the block of the scalar PHI and
     add the arguments of all children that already have vector defs.
     Return true on success.  */
8941
8942 bool
8943 vectorizable_phi (vec_info *,
8944                   stmt_vec_info stmt_info, gimple **vec_stmt,
8945                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8946 {
8947   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8948     return false;
8949
8950   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8951     return false;
8952
8953   tree vectype = SLP_TREE_VECTYPE (slp_node);
8954
8955   if (!vec_stmt) /* transformation not required.  */
8956     {
8957       slp_tree child;
8958       unsigned i;
8959       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8960         if (!child)
8961           {
8962             if (dump_enabled_p ())
8963               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8964                                "PHI node with unvectorized backedge def\n");
8965             return false;
8966           }
8967         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8968           {
8969             if (dump_enabled_p ())
8970               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8971                                "incompatible vector types for invariants\n");
8972             return false;
8973           }
8974         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8975                  && !useless_type_conversion_p (vectype,
8976                                                 SLP_TREE_VECTYPE (child)))
8977           {
8978             /* With bools we can have mask and non-mask precision vectors
8979                or different non-mask precisions.  While pattern recog is
8980                supposed to guarantee consistency here bugs in it can cause
8981                mismatches (PR103489 and PR103800 for example).
8982                Deal with them here instead of ICEing later.  */
8983             if (dump_enabled_p ())
8984               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8985                                "incompatible vector type setup from "
8986                                "bool pattern detection\n");
8987             return false;
8988           }
8989
8990       /* For single-argument PHIs assume coalescing which means zero cost
8991          for the scalar and the vector PHIs.  This avoids artificially
8992          favoring the vector path (but may pessimize it in some cases).  */
8993       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8994         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8995                           vector_stmt, stmt_info, vectype, 0, vect_body);
8996       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8997       return true;
8998     }
8999
9000   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9001   basic_block bb = gimple_bb (stmt_info->stmt);
9002   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9003   auto_vec<gphi *> new_phis;
9004   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9005     {
9006       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9007
9008       /* Skip not yet vectorized defs.  */
9009       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9010           && SLP_TREE_VEC_DEFS (child).is_empty ())
9011         continue;
9012
9013       auto_vec<tree> vec_oprnds;
9014       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
           /* The vector PHIs are created lazily, when the first child with
              vector defs is seen, one PHI per vector def of that child.  */
9015       if (!new_phis.exists ())
9016         {
9017           new_phis.create (vec_oprnds.length ());
9018           for (unsigned j = 0; j < vec_oprnds.length (); j++)
9019             {
9020               /* Create the vectorized PHI node.  */
9021               new_phis.quick_push (create_phi_node (vec_dest, bb));
9022               slp_node->push_vec_def (new_phis[j]);
9023             }
9024         }
           /* Add this child's vector defs as the arguments for the edge of
              scalar PHI argument I.  */
9025       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9026       for (unsigned j = 0; j < vec_oprnds.length (); j++)
9027         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9028     }
9029   /* We should have at least one already vectorized child.  */
9030   gcc_assert (new_phis.exists ());
9031
9032   return true;
9033 }
9034
9035 /* Vectorizes first order recurrences.  An overview of the transformation
9036    is described below. Suppose we have the following loop.
9037
9038      int t = 0;
9039      for (int i = 0; i < n; ++i)
9040        {
9041          b[i] = a[i] - t;
9042          t = a[i];
9043        }
9044
9045    There is a first-order recurrence on 'a'. For this loop, the scalar IR
9046    looks (simplified) like:
9047
9048     scalar.preheader:
9049       init = 0;
9050
9051     scalar.body:
9052       i = PHI <0(scalar.preheader), i+1(scalar.body)>
9053       _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9054       _1 = a[i]
9055       b[i] = _1 - _2
9056       if (i < n) goto scalar.body
9057
9058    In this example, _2 is a recurrence because its value depends on the
9059    previous iteration.  We vectorize this as (VF = 4)
9060
9061     vector.preheader:
9062       vect_init = vect_cst(..., ..., ..., 0)
9063
9064     vector.body
9065       i = PHI <0(vector.preheader), i+4(vector.body)>
9066       vect_1 = PHI <vect_init(vector.preheader), vect_2(vector.body)>
9067       vect_2 = a[i, i+1, i+2, i+3];
9068       vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9069       b[i, i+1, i+2, i+3] = vect_2 - vect_3
9070       if (..) goto vector.body
9071
9072    In this function, vectorizable_recurr, we code generate both the
9073    vector PHI node and the permute since those together compute the
9074    vectorized value of the scalar PHI.  We do not yet have the
9075    backedge value to fill in there nor into the vec_perm.  Those
9076    are filled in maybe_set_vectorized_backedge_value and
9077    vect_schedule_scc.
9078
9079    TODO:  Since the scalar loop does not have a use of the recurrence
9080    outside of the loop the natural way to implement peeling via
9081    vectorizing the live value doesn't work.  For now peeling of loops
9082    with a recurrence is not implemented.  For SLP the supported cases
9083    are restricted to those requiring a single vector recurrence PHI.
     Return false if LOOP_VINFO is null, if STMT_INFO is not a PHI with
     def type vect_first_order_recurrence, or if twice the recurrence
     distance (the number of lanes of SLP_NODE, 1 without SLP) may exceed
     the number of vector elements.
     If VEC_STMT is null only analysis is done: can_vec_perm_const_p has to
     accept the permute, the children of SLP_NODE have to accept its vector
     type, and one scalar_to_vec prologue cost plus NCOPIES vector_stmt
     body costs are recorded in COST_VEC.
     Otherwise emit the vector PHI and NCOPIES permutes.  The permutes are
     pushed to SLP_NODE, or to STMT_VINFO_VEC_STMTS (STMT_INFO) with the
     first of them also stored in *VEC_STMT.  */
9084
9085 bool
9086 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9087                      gimple **vec_stmt, slp_tree slp_node,
9088                      stmt_vector_for_cost *cost_vec)
9089 {
9090   if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9091     return false;
9092
9093   gphi *phi = as_a<gphi *> (stmt_info->stmt);
9094
9095   /* So far we only support first-order recurrence auto-vectorization.  */
9096   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9097     return false;
9098
9099   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9100   unsigned ncopies;
9101   if (slp_node)
9102     ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9103   else
9104     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9105   poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9106   unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9107   /* We need to be able to make progress with a single vector.  */
9108   if (maybe_gt (dist * 2, nunits))
9109     {
9110       if (dump_enabled_p ())
9111         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9112                          "first order recurrence exceeds half of "
9113                          "a vector\n");
9114       return false;
9115     }
9116
9117   /* First-order recurrence autovectorization needs to handle permutation
9118      with indices = [nunits-dist, nunits-dist+1, nunits-dist+2, ...], which
          is [nunits-1, nunits, nunits+1, ...] when dist is 1.  The selector
          is encoded as a single stepped pattern of three elements.  */
9119   vec_perm_builder sel (nunits, 1, 3);
9120   for (int i = 0; i < 3; ++i)
9121     sel.quick_push (nunits - dist + i);
9122   vec_perm_indices indices (sel, 2, nunits);
9123
9124   if (!vec_stmt) /* transformation not required.  */
9125     {
9126       if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9127                                  indices))
9128         return false;
9129
9130       if (slp_node)
9131         {
9132           /* We eventually need to set a vector type on invariant
9133              arguments.  */
9134           unsigned j;
9135           slp_tree child;
9136           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9137             if (!vect_maybe_update_slp_op_vectype
9138                   (child, SLP_TREE_VECTYPE (slp_node)))
9139               {
9140                 if (dump_enabled_p ())
9141                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9142                                    "incompatible vector types for "
9143                                    "invariants\n");
9144                 return false;
9145               }
9146         }
9147       /* The recurrence costs the initialization vector and one permute
9148          for each copy.  */
9149       unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9150                                                  stmt_info, 0, vect_prologue);
9151       unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9152                                                stmt_info, 0, vect_body);
9153       if (dump_enabled_p ())
9154         dump_printf_loc (MSG_NOTE, vect_location,
9155                          "vectorizable_recurr: inside_cost = %d, "
9156                          "prologue_cost = %d .\n", inside_cost,
9157                          prologue_cost);
9158
9159       STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9160       return true;
9161     }
9162
       /* Build the initial vector from the preheader value of the scalar PHI,
          converting it to the vector element type first when the types
          differ.  */
9163   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9164   basic_block bb = gimple_bb (phi);
9165   tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9166   if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9167     {
9168       gimple_seq stmts = NULL;
9169       preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9170       gsi_insert_seq_on_edge_immediate (pe, stmts);
9171     }
9172   tree vec_init = build_vector_from_val (vectype, preheader);
9173   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9174
9175   /* Create the vectorized first-order PHI node.  */
9176   tree vec_dest = vect_get_new_vect_var (vectype,
9177                                          vect_simple_var, "vec_recur_");
9178   gphi *new_phi = create_phi_node (vec_dest, bb);
9179   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9180
9181   /* Build the shuffle mask for the first-order recurrence:
9182        result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
9183   tree perm = vect_gen_perm_mask_checked (vectype, indices);
9184
9185   /* Insert the required permute after the latch definition.  Only the
9186      first operand of the first permute is known (the new PHI result); all
9187      other vector operands are left NULL for now and will be updated when we
          have vectorized the latch definition.  */
9188   edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9189   gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9190   gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9191   gsi_next (&gsi2);
9192
9193   for (unsigned i = 0; i < ncopies; ++i)
9194     {
9195       vec_dest = make_ssa_name (vectype);
9196       gassign *vperm
9197           = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9198                                  i == 0 ? gimple_phi_result (new_phi) : NULL,
9199                                  NULL, perm);
9200       vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9201
9202       if (slp_node)
9203         slp_node->push_vec_def (vperm);
9204       else
9205         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9206     }
9207
9208   if (!slp_node)
9209     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9210   return true;
9211 }
9212
9213 /* Return true if VECTYPE represents a vector that requires lowering
9214    by the vector lowering pass.  That is the case when the mode of
     VECTYPE is not a vector mode, except for vector boolean types whose
     element precision is 1.  */
9215
9216 bool
9217 vect_emulated_vector_p (tree vectype)
9218 {
9219   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9220           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9221               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9222 }
9223
9224 /* Return true if we can emulate CODE on an integer mode representation
9225    of a vector.  Only PLUS_EXPR, MINUS_EXPR, NEGATE_EXPR and the bitwise
     BIT_AND_EXPR, BIT_IOR_EXPR, BIT_XOR_EXPR and BIT_NOT_EXPR qualify;
     every other code yields false.  */
9226
9227 bool
9228 vect_can_vectorize_without_simd_p (tree_code code)
9229 {
9230   switch (code)
9231     {
9232     case PLUS_EXPR:
9233     case MINUS_EXPR:
9234     case NEGATE_EXPR:
9235     case BIT_AND_EXPR:
9236     case BIT_IOR_EXPR:
9237     case BIT_XOR_EXPR:
9238     case BIT_NOT_EXPR:
9239       return true;
9240
9241     default:
9242       return false;
9243     }
9244 }
9245
9246 /* Likewise, but taking a code_helper.  A CODE that is not a tree code
     never qualifies; a tree code is forwarded to the tree_code overload.  */
9247
9248 bool
9249 vect_can_vectorize_without_simd_p (code_helper code)
9250 {
9251   return (code.is_tree_code ()
9252           && vect_can_vectorize_without_simd_p (tree_code (code)));
9253 }
9254
9255 /* Create vector init for vectorized iv.
     Return the VECTYPE vector a nonlinear induction of kind INDUCTION_TYPE
     starts with; statements are built through gimple_build and
     gimple_convert on STMTS.  With X = INIT_EXPR converted to the element
     type of VECTYPE and S = STEP_EXPR:
       vect_step_op_shr/shl: { X, X, ... } shifted lane-wise by
                             VEC_SERIES_EXPR <0, S>,
       vect_step_op_neg:     { X, -X, X, -X, ... },
       vect_step_op_mul:     { X, X, ... } * { 1, S, S*S, ... } computed in
                             the corresponding unsigned type; NUNITS has to
                             be constant.
     Any other INDUCTION_TYPE is unreachable.  */
9256 static tree
9257 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9258                                tree step_expr, poly_uint64 nunits,
9259                                tree vectype,
9260                                enum vect_induction_op_type induction_type)
9261 {
9262   unsigned HOST_WIDE_INT const_nunits;
9263   tree vec_shift, vec_init, new_name;
9264   unsigned i;
9265   tree itype = TREE_TYPE (vectype);
9266
9267   /* Create vec_init from X = init_expr and S = step_expr; how the lanes
9268      are derived depends on INDUCTION_TYPE, see the cases below.  */
9269   new_name = gimple_convert (stmts, itype, init_expr);
9270   switch (induction_type)
9271     {
9272     case vect_step_op_shr:
9273     case vect_step_op_shl:
9274       /* Build the initial value: splat X and shift it lane-wise by
              VEC_SERIES_EXPR <0, S>.  */
9275       vec_init = gimple_build_vector_from_val (stmts,
9276                                                vectype,
9277                                                new_name);
9278       vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9279                                 build_zero_cst (itype), step_expr);
9280       vec_init = gimple_build (stmts,
9281                                (induction_type == vect_step_op_shr
9282                                 ? RSHIFT_EXPR : LSHIFT_EXPR),
9283                                vectype, vec_init, vec_shift);
9284       break;
9285
9286     case vect_step_op_neg:
9287       {
             /* Interleave the splat of X with its negation: the selector is
                { 0, nunits, 1, nunits + 1, 2, nunits + 2, ... }.  */
9288         vec_init = gimple_build_vector_from_val (stmts,
9289                                                  vectype,
9290                                                  new_name);
9291         tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9292                                      vectype, vec_init);
9293         /* The encoding has 2 interleaved stepped patterns.  */
9294         vec_perm_builder sel (nunits, 2, 3);
9295         sel.quick_grow (6);
9296         for (i = 0; i < 3; i++)
9297           {
9298             sel[2 * i] = i;
9299             sel[2 * i + 1] = i + nunits;
9300           }
9301         vec_perm_indices indices (sel, 2, nunits);
9302         /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9303            fail when vec_init is const vector. In that situation vec_perm is not
9304            really needed.  */
9305         tree perm_mask_even
9306           = vect_gen_perm_mask_any (vectype, indices);
9307         vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9308                                  vectype,
9309                                  vec_init, vec_neg,
9310                                  perm_mask_even);
9311       }
9312       break;
9313
9314     case vect_step_op_mul:
9315       {
9316         /* Use unsigned mult to avoid UD integer overflow.
                NOTE(review): const_nunits is only set by the is_constant call
                inside the gcc_assert below; confirm that gcc_assert evaluates
                its operand in every build configuration.  */
9317         gcc_assert (nunits.is_constant (&const_nunits));
9318         tree utype = unsigned_type_for (itype);
9319         tree uvectype = build_vector_type (utype,
9320                                            TYPE_VECTOR_SUBPARTS (vectype));
9321         new_name = gimple_convert (stmts, utype, new_name);
9322         vec_init = gimple_build_vector_from_val (stmts,
9323                                                  uvectype,
9324                                                  new_name);
9325         tree_vector_builder elts (uvectype, const_nunits, 1);
9326         tree elt_step = build_one_cst (utype);
9327
9328         elts.quick_push (elt_step);
9329         for (i = 1; i < const_nunits; i++)
9330           {
9331             /* Create: elt_step_i = elt_step_(i-1) * step_expr.  */
9332             elt_step = gimple_build (stmts, MULT_EXPR,
9333                                      utype, elt_step, step_expr);
9334             elts.quick_push (elt_step);
9335           }
9336         /* Create a vector from [elt_step_0, elt_step_1, ...,
9337            elt_step_nunits-1].  */
9338         tree vec_mul = gimple_build_vector (stmts, &elts);
9339         vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9340                                  vec_init, vec_mul);
9341         vec_init = gimple_convert (stmts, vectype, vec_init);
9342       }
9343       break;
9344
9345     default:
9346       gcc_unreachable ();
9347     }
9348
9349   return vec_init;
9350 }
9351
9352 /* Peel init_expr by skip_niter for induction_type.
     Return INIT_EXPR advanced by SKIP_NITERS scalar iterations of a
     nonlinear induction of kind INDUCTION_TYPE with step STEP_EXPR;
     statements are built through gimple_build and gimple_convert on STMTS.
     SKIP_NITERS has to be an INTEGER_CST.
       vect_step_op_neg:     negate INIT_EXPR when SKIP_NITERS is odd,
       vect_step_op_shr/shl: shift by STEP_EXPR * SKIP_NITERS, with a
                             special result when that amount is not known
                             to be below the precision of the type,
       vect_step_op_mul:     multiply by STEP_EXPR ** SKIP_NITERS in the
                             corresponding unsigned type.
     Any other INDUCTION_TYPE is unreachable.  */
9353 tree
9354 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9355                              tree skip_niters, tree step_expr,
9356                              enum vect_induction_op_type induction_type)
9357 {
9358   gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9359   tree type = TREE_TYPE (init_expr);
9360   unsigned prec = TYPE_PRECISION (type);
9361   switch (induction_type)
9362     {
9363     case vect_step_op_neg:
9364       if (TREE_INT_CST_LOW (skip_niters) % 2)
9365         init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9366       /* else no change.  */
9367       break;
9368
9369     case vect_step_op_shr:
9370     case vect_step_op_shl:
9371       skip_niters = gimple_convert (stmts, type, skip_niters);
9372       step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9373       /* When the shift amount is >= precision we need to avoid UB.
9374          In the original loop there's no UB, and according to the semantics
9375          init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
9376       if (!tree_fits_uhwi_p (step_expr)
9377           || tree_to_uhwi (step_expr) >= prec)
9378         {
9379           if (induction_type == vect_step_op_shl
9380               || TYPE_UNSIGNED (type))
9381             init_expr = build_zero_cst (type);
9382           else
9383             init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9384                                       init_expr,
9385                                       wide_int_to_tree (type, prec - 1));
9386         }
9387       else
9388         init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9389                                           ? RSHIFT_EXPR : LSHIFT_EXPR),
9390                                   type, init_expr, step_expr);
9391       break;
9392
9393     case vect_step_op_mul:
9394       {
             /* Compute step_expr ** skip_niters modulo 2 ** precision with GMP
                (mpz_powm) and multiply init_expr by it in the unsigned type.  */
9395         tree utype = unsigned_type_for (type);
9396         init_expr = gimple_convert (stmts, utype, init_expr);
9397         wide_int skipn = wi::to_wide (skip_niters);
9398         wide_int begin = wi::to_wide (step_expr);
9399         auto_mpz base, exp, mod, res;
9400         wi::to_mpz (begin, base, TYPE_SIGN (type));
9401         wi::to_mpz (skipn, exp, UNSIGNED);
9402         mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9403         mpz_powm (res, base, exp, mod);
9404         begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9405         tree mult_expr = wide_int_to_tree (utype, begin);
9406         init_expr = gimple_build (stmts, MULT_EXPR, utype,
9407                                   init_expr, mult_expr);
9408         init_expr = gimple_convert (stmts, type, init_expr);
9409       }
9410       break;
9411
9412     default:
9413       gcc_unreachable ();
9414     }
9415
9416   return init_expr;
9417 }
9418
9419 /* Create vector step for vectorized iv.
     Return the scalar step a nonlinear induction of kind INDUCTION_TYPE
     with per-iteration step STEP_EXPR advances by per vector iteration,
     VF being the vectorization factor:
       vect_step_op_mul: the constant STEP_EXPR ** VF, computed with
                         wide_int multiplication; VF has to be constant,
       vect_step_op_neg: NULL, no step is computed,
       otherwise:        VF * STEP_EXPR, built through gimple_build on
                         STMTS.  */
9420 static tree
9421 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9422                                poly_uint64 vf,
9423                                enum vect_induction_op_type induction_type)
9424 {
9425   tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9426   tree new_name = NULL;
9427   /* Step should be pow (step, vf) for mult induction.  */
9428   if (induction_type == vect_step_op_mul)
9429     {
9430       gcc_assert (vf.is_constant ());
9431       wide_int begin = wi::to_wide (step_expr);
9432
           /* BEGIN starts as step, so VF - 1 further multiplications give
              step ** VF.  */
9433       for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9434         begin = wi::mul (begin, wi::to_wide (step_expr));
9435
9436       new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9437     }
9438   else if (induction_type == vect_step_op_neg)
9439     /* Do nothing.  */
9440     ;
9441   else
9442     new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9443                              expr, step_expr);
9444   return new_name;
9445 }
9446
9447 /* Broadcast scalar step NEW_NAME to a VECTYPE vector via vect_init_vector.
9448    Returns NULL for neg induction, which needs no step.  */
9449 static tree
9450 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9451                                    stmt_vec_info stmt_info,
9452                                    tree new_name, tree vectype,
9453                                    enum vect_induction_op_type induction_type)
9454 {
9455   if (induction_type == vect_step_op_neg)
9456     return NULL;
9457
9458   tree scalar_step = unshare_expr (new_name);
9459   /* Only constants and SSA names are expected as the scalar step.  */
9460   gcc_assert (TREE_CODE (new_name) == SSA_NAME
9461               || CONSTANT_CLASS_P (new_name));
9462   tree splat = build_vector_from_val (vectype, scalar_step);
9463   return vect_init_vector (loop_vinfo, stmt_info, splat, vectype, NULL);
9464 }
9465
9466 /* Advance the vectorized iv INDUC_DEF by VEC_STEP with the operation
9467    selected by INDUCTION_TYPE.  Statements needed for the update are
9468    appended to STMTS.  Returns the updated vector def, which is
9469    INDUC_DEF itself for neg induction.  */
9470 static tree
9471 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9472                           tree induc_def, tree vec_step,
9473                           enum vect_induction_op_type induction_type)
9474 {
9475   switch (induction_type)
9476     {
9477     case vect_step_op_neg:
9478       /* No update statement is needed.  */
9479       return induc_def;
9480
9481     case vect_step_op_shl:
9482       /* Vector left shift by VEC_STEP.  */
9483       return gimple_build (stmts, LSHIFT_EXPR, vectype,
9484                            induc_def, vec_step);
9485
9486     case vect_step_op_shr:
9487       /* Vector right shift by VEC_STEP.  */
9488       return gimple_build (stmts, RSHIFT_EXPR, vectype,
9489                            induc_def, vec_step);
9490
9491     case vect_step_op_mul:
9492       {
9493         /* Multiply in the unsigned variant of VECTYPE to avoid undefined
9494            signed overflow, then convert the product back to VECTYPE.  */
9495         tree elt_utype = unsigned_type_for (TREE_TYPE (vectype));
9496         tree uvectype
9497           = build_vector_type (elt_utype, TYPE_VECTOR_SUBPARTS (vectype));
9498         tree lhs = gimple_convert (stmts, uvectype, induc_def);
9499         tree rhs = gimple_convert (stmts, uvectype, vec_step);
9500         tree prod = gimple_build (stmts, MULT_EXPR, uvectype, lhs, rhs);
9501         return gimple_convert (stmts, vectype, prod);
9502       }
9503
9504     default:
9505       /* Other induction kinds are not expected here.  */
9506       gcc_unreachable ();
9507     }
9508 }
9509
9510 /* Function vectorizable_nonlinear_induction
9511
9512    Check if STMT_INFO performs a nonlinear induction computation (mul, shl,
9513    shr or neg step) that can be vectorized.  If VEC_STMT is NULL only analyze
9514    and record costs in COST_VEC; otherwise create the vectorized phi, put it
9515    in VEC_STMT and add it to the loop header.  SLP_NODE is not supported.
9516    Return true if STMT_INFO is vectorizable in this way.  */
9517
9518 static bool
9519 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9520                                   stmt_vec_info stmt_info,
9521                                   gimple **vec_stmt, slp_tree slp_node,
9522                                   stmt_vector_for_cost *cost_vec)
9523 {
9524   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9525   unsigned ncopies;
9526   bool nested_in_vect_loop = false;
9527   class loop *iv_loop;
9528   tree vec_def;
9529   edge pe = loop_preheader_edge (loop);
9530   basic_block new_bb;
9531   tree vec_init, vec_step;
9532   tree new_name;
9533   gimple *new_stmt;
9534   gphi *induction_phi;
9535   tree induc_def, vec_dest;
9536   tree init_expr, step_expr;
9537   tree niters_skip;
9538   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9539   unsigned i;
9540   gimple_stmt_iterator si;
9541
9542   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9543
9544   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9545   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9546   enum vect_induction_op_type induction_type
9547     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9548
9549   gcc_assert (induction_type > vect_step_op_add);
9550
9551   if (slp_node)
9552     ncopies = 1;
9553   else
9554     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9555   gcc_assert (ncopies >= 1);
9556
9557   /* FORNOW. Only handle nonlinear induction in the same loop.  */
9558   if (nested_in_vect_loop_p (loop, stmt_info))
9559     {
9560       if (dump_enabled_p ())
9561         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9562                          "nonlinear induction in nested loop.\n");
9563       return false;
9564     }
9565
9566   iv_loop = loop;
9567   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9568
9569   /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9570      update for each iv and a permutation to generate wanted vector iv.  */
9571   if (slp_node)
9572     {
9573       if (dump_enabled_p ())
9574         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9575                          "SLP induction not supported for nonlinear"
9576                          " induction.\n");
9577       return false;
9578     }
9579
9580   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9581     {
9582       if (dump_enabled_p ())
9583         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9584                          "floating point nonlinear induction vectorization"
9585                          " not supported.\n");
9586       return false;
9587     }
9588
9589   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9590   init_expr = vect_phi_initial_value (phi);
9591   gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9592               && TREE_CODE (step_expr) == INTEGER_CST);
9593   /* step_expr should have the same type as the vector elements, e.g. for
9594      uint64 a >> 1 the step is int, but a vector<uint64> shift is used.  */
9595   step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9596
9597   if (TREE_CODE (init_expr) == INTEGER_CST)
9598     init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9599   else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9600     {
9601       /* INIT_EXPR could be a bit_field, bail out for such case.  */
9602       if (dump_enabled_p ())
9603         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9604                          "nonlinear induction vectorization failed:"
9605                          " component type of vectype is not a nop conversion"
9606                          " from type of init_expr.\n");
9607       return false;
9608     }
9609
9610   switch (induction_type)
9611     {
9612     case vect_step_op_neg:
9613       if (TREE_CODE (init_expr) != INTEGER_CST
9614           && TREE_CODE (init_expr) != REAL_CST)
9615         {
9616           /* Check for backend support of NEGATE_EXPR and vec_perm.  */
9617           if (!directly_supported_p (NEGATE_EXPR, vectype))
9618             return false;
9619
9620           /* The encoding has 2 interleaved stepped patterns.  */
9621           vec_perm_builder sel (nunits, 2, 3);
9622           machine_mode mode = TYPE_MODE (vectype);
9623           sel.quick_grow (6);
9624           for (i = 0; i < 3; i++)
9625             {
9626               sel[i * 2] = i;
9627               sel[i * 2 + 1] = i + nunits;
9628             }
9629           vec_perm_indices indices (sel, 2, nunits);
9630           if (!can_vec_perm_const_p (mode, mode, indices))
9631             return false;
9632         }
9633       break;
9634
9635     case vect_step_op_mul:
9636       {
9637         /* Check for backend support of MULT_EXPR.  */
9638         if (!directly_supported_p (MULT_EXPR, vectype))
9639           return false;
9640
9641         /* ?? How to construct vector step for variable number vector.
9642            [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
9643         if (!vf.is_constant ())
9644           return false;
9645       }
9646       break;
9647
9648     case vect_step_op_shr:
9649       /* Check for backend support of RSHIFT_EXPR.  */
9650       if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9651         return false;
9652
9653       /* Don't shift more than type precision to avoid UD.  */
9654       if (!tree_fits_uhwi_p (step_expr)
9655           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9656                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9657         return false;
9658       break;
9659
9660     case vect_step_op_shl:
9661       /* Check for backend support of LSHIFT_EXPR.  */
9662       if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9663         return false;
9664
9665       /* Don't shift more than type precision to avoid UD.  */
9666       if (!tree_fits_uhwi_p (step_expr)
9667           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9668                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9669         return false;
9670
9671       break;
9672
9673     default:
9674       gcc_unreachable ();
9675     }
9676
9677   if (!vec_stmt) /* transformation not required.  */
9678     {
9679       unsigned inside_cost = 0, prologue_cost = 0;
9680       /* loop cost for vec_loop.  Neg induction doesn't have any
9681          inside_cost, so nothing is recorded in COST_VEC for it.  */
9682       if (induction_type != vect_step_op_neg)
9683         inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9684                                         stmt_info, 0, vect_body);
9685
9686       /* prologue cost for vec_init and vec_step.  */
9687       prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9688                                         stmt_info, 0, vect_prologue);
9689
9690       if (dump_enabled_p ())
9691         dump_printf_loc (MSG_NOTE, vect_location,
9692                          "vect_model_induction_cost: inside_cost = %d, "
9693                          "prologue_cost = %d. \n", inside_cost,
9694                          prologue_cost);
9695
9696       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9697       DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9698       return true;
9699     }
9700
9701   /* Transform.  */
9702
9703   /* Compute a vector variable, initialized with the first NUNITS values
9704      of the induction variable.  How successive lanes derive from the
9705      initial value and the step depends on INDUCTION_TYPE and is left
9706      to vect_create_nonlinear_iv_init.  */
9707
9708   if (dump_enabled_p ())
9709     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9710
9711   pe = loop_preheader_edge (iv_loop);
9712   /* Find the first insertion point in the BB.  */
9713   basic_block bb = gimple_bb (phi);
9714   si = gsi_after_labels (bb);
9715
9716   gimple_seq stmts = NULL;
9717
9718   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9719   /* If we are using the loop mask to "peel" for alignment then we need
9720      to adjust the start value here.  */
9721   if (niters_skip != NULL_TREE)
9722     init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9723                                              step_expr, induction_type);
9724
9725   vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9726                                             step_expr, nunits, vectype,
9727                                             induction_type);
9728   if (stmts)
9729     {
9730       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9731       gcc_assert (!new_bb);
9732     }
9733
9734   stmts = NULL;
9735   new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9736                                             vf, induction_type);
9737   if (stmts)
9738     {
9739       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9740       gcc_assert (!new_bb);
9741     }
9742
9743   vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9744                                                 new_name, vectype,
9745                                                 induction_type);
9746   /* Create the following def-use cycle:
9747      loop prolog:
9748      vec_init = ...
9749      vec_step = ...
9750      loop:
9751      vec_iv = PHI <vec_init, vec_loop>
9752      ...
9753      STMT
9754      ...
9755      vec_loop = vec_iv OP vec_step;  (OP depends on INDUCTION_TYPE)  */
9756
9757   /* Create the induction-phi that defines the induction-operand.  */
9758   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9759   induction_phi = create_phi_node (vec_dest, iv_loop->header);
9760   induc_def = PHI_RESULT (induction_phi);
9761
9762   /* Create the iv update inside the loop.  */
9763   stmts = NULL;
9764   vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9765                                       induc_def, vec_step,
9766                                       induction_type);
9767
9768   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9769
9770   /* Set the arguments of the phi node:  */
9771   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9772   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9773                UNKNOWN_LOCATION);
9774
9775   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9776   *vec_stmt = induction_phi;
9777
9778   /* In case that vectorization factor (VF) is bigger than the number of
9779      elements that fit in a vectype (nunits), we have to generate more than
9780      one vector stmt, i.e. "unroll" the vector stmt by a factor VF/nunits.
9781      For more details see documentation in vectorizable_operation.  */
9782
9783   if (ncopies > 1)
9784     {
9785       stmts = NULL;
9786       /* FORNOW. This restriction should be relaxed.  */
9787       gcc_assert (!nested_in_vect_loop);
9788
9789       new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9790                                                 nunits, induction_type);
9791       /* As for the VF step above, emit any stmts computing the step.  */
9792       if (stmts)
9793         {
9794           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9795           gcc_assert (!new_bb);
9796         }
9797
9798       vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9799                                                     new_name, vectype,
9800                                                     induction_type);
9801       vec_def = induc_def;
9802       for (i = 1; i < ncopies; i++)
9803         {
9804           /* vec_i = vec_prev OP vec_step.  */
9805           stmts = NULL;
9806           vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9807                                               vec_def, vec_step,
9808                                               induction_type);
9809           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9810           new_stmt = SSA_NAME_DEF_STMT (vec_def);
9811           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9812         }
9813     }
9814
9815   if (dump_enabled_p ())
9816     dump_printf_loc (MSG_NOTE, vect_location,
9817                      "transform induction: created def-use cycle: %G%G",
9818                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9819
9820   return true;
9821 }
9822
9823 /* Function vectorizable_induction
9824
9825    Check if STMT_INFO performs an induction computation that can be vectorized.
9826    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9827    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9828    Return true if STMT_INFO is vectorizable in this way.  */
9829
9830 bool
9831 vectorizable_induction (loop_vec_info loop_vinfo,
9832                         stmt_vec_info stmt_info,
9833                         gimple **vec_stmt, slp_tree slp_node,
9834                         stmt_vector_for_cost *cost_vec)
9835 {
9836   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9837   unsigned ncopies;
9838   bool nested_in_vect_loop = false;
9839   class loop *iv_loop;
9840   tree vec_def;
9841   edge pe = loop_preheader_edge (loop);
9842   basic_block new_bb;
9843   tree new_vec, vec_init, vec_step, t;
9844   tree new_name;
9845   gimple *new_stmt;
9846   gphi *induction_phi;
9847   tree induc_def, vec_dest;
9848   tree init_expr, step_expr;
9849   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9850   unsigned i;
9851   tree expr;
9852   gimple_stmt_iterator si;
9853   enum vect_induction_op_type induction_type
9854     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9855
9856   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9857   if (!phi)
9858     return false;
9859
9860   if (!STMT_VINFO_RELEVANT_P (stmt_info))
9861     return false;
9862
9863   /* Make sure it was recognized as induction computation.  */
9864   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9865     return false;
9866
9867   /* Handle nonlinear induction in a separate place.  */
9868   if (induction_type != vect_step_op_add)
9869     return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9870                                              vec_stmt, slp_node, cost_vec);
9871
9872   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9873   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9874
9875   if (slp_node)
9876     ncopies = 1;
9877   else
9878     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9879   gcc_assert (ncopies >= 1);
9880
9881   /* FORNOW. These restrictions should be relaxed.  */
9882   if (nested_in_vect_loop_p (loop, stmt_info))
9883     {
9884       imm_use_iterator imm_iter;
9885       use_operand_p use_p;
9886       gimple *exit_phi;
9887       edge latch_e;
9888       tree loop_arg;
9889
9890       if (ncopies > 1)
9891         {
9892           if (dump_enabled_p ())
9893             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9894                              "multiple types in nested loop.\n");
9895           return false;
9896         }
9897
9898       exit_phi = NULL;
9899       latch_e = loop_latch_edge (loop->inner);
9900       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9901       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9902         {
9903           gimple *use_stmt = USE_STMT (use_p);
9904           if (is_gimple_debug (use_stmt))
9905             continue;
9906
9907           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9908             {
9909               exit_phi = use_stmt;
9910               break;
9911             }
9912         }
9913       if (exit_phi)
9914         {
9915           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9916           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9917                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9918             {
9919               if (dump_enabled_p ())
9920                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9921                                  "inner-loop induction only used outside "
9922                                  "of the outer vectorized loop.\n");
9923               return false;
9924             }
9925         }
9926
9927       nested_in_vect_loop = true;
9928       iv_loop = loop->inner;
9929     }
9930   else
9931     iv_loop = loop;
9932   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9933
9934   if (slp_node && !nunits.is_constant ())
9935     {
9936       /* The current SLP code creates the step value element-by-element.  */
9937       if (dump_enabled_p ())
9938         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9939                          "SLP induction not supported for variable-length"
9940                          " vectors.\n");
9941       return false;
9942     }
9943
9944   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9945     {
9946       if (dump_enabled_p ())
9947         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9948                          "floating point induction vectorization disabled\n");
9949       return false;
9950     }
9951
9952   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9953   gcc_assert (step_expr != NULL_TREE);
9954   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9955
9956   /* Check for backend support of PLUS/MINUS_EXPR. */
9957   if (!directly_supported_p (PLUS_EXPR, step_vectype)
9958       || !directly_supported_p (MINUS_EXPR, step_vectype))
9959     return false;
9960
9961   if (!vec_stmt) /* transformation not required.  */
9962     {
9963       unsigned inside_cost = 0, prologue_cost = 0;
9964       if (slp_node)
9965         {
9966           /* We eventually need to set a vector type on invariant
9967              arguments.  */
9968           unsigned j;
9969           slp_tree child;
9970           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9971             if (!vect_maybe_update_slp_op_vectype
9972                 (child, SLP_TREE_VECTYPE (slp_node)))
9973               {
9974                 if (dump_enabled_p ())
9975                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9976                                    "incompatible vector types for "
9977                                    "invariants\n");
9978                 return false;
9979               }
9980           /* loop cost for vec_loop.  */
9981           inside_cost
9982             = record_stmt_cost (cost_vec,
9983                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9984                                 vector_stmt, stmt_info, 0, vect_body);
9985           /* prologue cost for vec_init (if not nested) and step.  */
9986           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9987                                             scalar_to_vec,
9988                                             stmt_info, 0, vect_prologue);
9989         }
9990       else /* if (!slp_node) */
9991         {
9992           /* loop cost for vec_loop.  */
9993           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9994                                           stmt_info, 0, vect_body);
9995           /* prologue cost for vec_init and vec_step.  */
9996           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9997                                             stmt_info, 0, vect_prologue);
9998         }
9999       if (dump_enabled_p ())
10000         dump_printf_loc (MSG_NOTE, vect_location,
10001                          "vect_model_induction_cost: inside_cost = %d, "
10002                          "prologue_cost = %d .\n", inside_cost,
10003                          prologue_cost);
10004
10005       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10006       DUMP_VECT_SCOPE ("vectorizable_induction");
10007       return true;
10008     }
10009
10010   /* Transform.  */
10011
10012   /* Compute a vector variable, initialized with the first VF values of
10013      the induction variable.  E.g., for an iv with IV_PHI='X' and
10014      evolution S, for a vector of 4 units, we want to compute:
10015      [X, X + S, X + 2*S, X + 3*S].  */
10016
10017   if (dump_enabled_p ())
10018     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10019
10020   pe = loop_preheader_edge (iv_loop);
10021   /* Find the first insertion point in the BB.  */
10022   basic_block bb = gimple_bb (phi);
10023   si = gsi_after_labels (bb);
10024
10025   /* For SLP induction we have to generate several IVs as for example
10026      with group size 3 we need
10027        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10028        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
10029   if (slp_node)
10030     {
10031       /* Enforced above.  */
10032       unsigned int const_nunits = nunits.to_constant ();
10033
10034       /* The initial values are vectorized, but any lanes > group_size
10035          need adjustment.  */
10036       slp_tree init_node
10037         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10038
10039       /* Gather steps.  Since we do not vectorize inductions as
10040          cycles we have to reconstruct the step from SCEV data.  */
10041       unsigned group_size = SLP_TREE_LANES (slp_node);
10042       tree *steps = XALLOCAVEC (tree, group_size);
10043       tree *inits = XALLOCAVEC (tree, group_size);
10044       stmt_vec_info phi_info;
10045       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10046         {
10047           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10048           if (!init_node)
10049             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10050                                            pe->dest_idx);
10051         }
10052
10053       /* Now generate the IVs.  */
10054       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10055       gcc_assert ((const_nunits * nvects) % group_size == 0);
10056       unsigned nivs;
10057       if (nested_in_vect_loop)
10058         nivs = nvects;
10059       else
10060         {
10061           /* Compute the number of distinct IVs we need.  First reduce
10062              group_size if it is a multiple of const_nunits so we get
10063              one IV for a group_size of 4 but const_nunits 2.  */
10064           unsigned group_sizep = group_size;
10065           if (group_sizep % const_nunits == 0)
10066             group_sizep = group_sizep / const_nunits;
10067           nivs = least_common_multiple (group_sizep,
10068                                         const_nunits) / const_nunits;
10069         }
10070       tree stept = TREE_TYPE (step_vectype);
10071       tree lupdate_mul = NULL_TREE;
10072       if (!nested_in_vect_loop)
10073         {
10074           /* The number of iterations covered in one vector iteration.  */
10075           unsigned lup_mul = (nvects * const_nunits) / group_size;
10076           lupdate_mul
10077             = build_vector_from_val (step_vectype,
10078                                      SCALAR_FLOAT_TYPE_P (stept)
10079                                      ? build_real_from_wide (stept, lup_mul,
10080                                                              UNSIGNED)
10081                                      : build_int_cstu (stept, lup_mul));
10082         }
10083       tree peel_mul = NULL_TREE;
10084       gimple_seq init_stmts = NULL;
10085       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10086         {
10087           if (SCALAR_FLOAT_TYPE_P (stept))
10088             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10089                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10090           else
10091             peel_mul = gimple_convert (&init_stmts, stept,
10092                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10093           peel_mul = gimple_build_vector_from_val (&init_stmts,
10094                                                    step_vectype, peel_mul);
10095         }
10096       unsigned ivn;
10097       auto_vec<tree> vec_steps;
10098       for (ivn = 0; ivn < nivs; ++ivn)
10099         {
10100           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10101           tree_vector_builder init_elts (vectype, const_nunits, 1);
10102           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10103           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10104             {
10105               /* The scalar steps of the IVs.  */
10106               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10107               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10108               step_elts.quick_push (elt);
10109               if (!init_node)
10110                 {
10111                   /* The scalar inits of the IVs if not vectorized.  */
10112                   elt = inits[(ivn*const_nunits + eltn) % group_size];
10113                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
10114                                                   TREE_TYPE (elt)))
10115                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10116                                         TREE_TYPE (vectype), elt);
10117                   init_elts.quick_push (elt);
10118                 }
10119               /* The number of steps to add to the initial values.  */
10120               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10121               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10122                                    ? build_real_from_wide (stept,
10123                                                            mul_elt, UNSIGNED)
10124                                    : build_int_cstu (stept, mul_elt));
10125             }
10126           vec_step = gimple_build_vector (&init_stmts, &step_elts);
10127           vec_steps.safe_push (vec_step);
10128           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10129           if (peel_mul)
10130             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10131                                      step_mul, peel_mul);
10132           if (!init_node)
10133             vec_init = gimple_build_vector (&init_stmts, &init_elts);
10134
10135           /* Create the induction-phi that defines the induction-operand.  */
10136           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10137                                             "vec_iv_");
10138           induction_phi = create_phi_node (vec_dest, iv_loop->header);
10139           induc_def = PHI_RESULT (induction_phi);
10140
10141           /* Create the iv update inside the loop  */
10142           tree up = vec_step;
10143           if (lupdate_mul)
10144             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10145                                vec_step, lupdate_mul);
10146           gimple_seq stmts = NULL;
10147           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10148           vec_def = gimple_build (&stmts,
10149                                   PLUS_EXPR, step_vectype, vec_def, up);
10150           vec_def = gimple_convert (&stmts, vectype, vec_def);
10151           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10152           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10153                        UNKNOWN_LOCATION);
10154
10155           if (init_node)
10156             vec_init = vect_get_slp_vect_def (init_node, ivn);
10157           if (!nested_in_vect_loop
10158               && !integer_zerop (step_mul))
10159             {
10160               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10161               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10162                                  vec_step, step_mul);
10163               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10164                                       vec_def, up);
10165               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10166             }
10167
10168           /* Set the arguments of the phi node:  */
10169           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10170
10171           slp_node->push_vec_def (induction_phi);
10172         }
10173       if (!nested_in_vect_loop)
10174         {
10175           /* Fill up to the number of vectors we need for the whole group.  */
10176           nivs = least_common_multiple (group_size,
10177                                         const_nunits) / const_nunits;
10178           vec_steps.reserve (nivs-ivn);
10179           for (; ivn < nivs; ++ivn)
10180             {
10181               slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10182               vec_steps.quick_push (vec_steps[0]);
10183             }
10184         }
10185
10186       /* Re-use IVs when we can.  We are generating further vector
10187          stmts by adding VF' * stride to the IVs generated above.  */
10188       if (ivn < nvects)
10189         {
10190           unsigned vfp
10191             = least_common_multiple (group_size, const_nunits) / group_size;
10192           tree lupdate_mul
10193             = build_vector_from_val (step_vectype,
10194                                      SCALAR_FLOAT_TYPE_P (stept)
10195                                      ? build_real_from_wide (stept,
10196                                                              vfp, UNSIGNED)
10197                                      : build_int_cstu (stept, vfp));
10198           for (; ivn < nvects; ++ivn)
10199             {
10200               gimple *iv
10201                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10202               tree def = gimple_get_lhs (iv);
10203               if (ivn < 2*nivs)
10204                 vec_steps[ivn - nivs]
10205                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10206                                   vec_steps[ivn - nivs], lupdate_mul);
10207               gimple_seq stmts = NULL;
10208               def = gimple_convert (&stmts, step_vectype, def);
10209               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10210                                   def, vec_steps[ivn % nivs]);
10211               def = gimple_convert (&stmts, vectype, def);
10212               if (gimple_code (iv) == GIMPLE_PHI)
10213                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10214               else
10215                 {
10216                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10217                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10218                 }
10219               slp_node->push_vec_def (def);
10220             }
10221         }
10222
10223       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10224       gcc_assert (!new_bb);
10225
10226       return true;
10227     }
10228
10229   init_expr = vect_phi_initial_value (phi);
10230
10231   gimple_seq stmts = NULL;
10232   if (!nested_in_vect_loop)
10233     {
10234       /* Convert the initial value to the IV update type.  */
10235       tree new_type = TREE_TYPE (step_expr);
10236       init_expr = gimple_convert (&stmts, new_type, init_expr);
10237
10238       /* If we are using the loop mask to "peel" for alignment then we need
10239          to adjust the start value here.  */
10240       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10241       if (skip_niters != NULL_TREE)
10242         {
10243           if (FLOAT_TYPE_P (vectype))
10244             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10245                                         skip_niters);
10246           else
10247             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10248           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10249                                          skip_niters, step_expr);
10250           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10251                                     init_expr, skip_step);
10252         }
10253     }
10254
10255   if (stmts)
10256     {
10257       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10258       gcc_assert (!new_bb);
10259     }
10260
10261   /* Create the vector that holds the initial_value of the induction.  */
10262   if (nested_in_vect_loop)
10263     {
10264       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
10265          been created during vectorization of previous stmts.  We obtain it
10266          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
10267       auto_vec<tree> vec_inits;
10268       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10269                                      init_expr, &vec_inits);
10270       vec_init = vec_inits[0];
10271       /* If the initial value is not of proper type, convert it.  */
10272       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10273         {
10274           new_stmt
10275             = gimple_build_assign (vect_get_new_ssa_name (vectype,
10276                                                           vect_simple_var,
10277                                                           "vec_iv_"),
10278                                    VIEW_CONVERT_EXPR,
10279                                    build1 (VIEW_CONVERT_EXPR, vectype,
10280                                            vec_init));
10281           vec_init = gimple_assign_lhs (new_stmt);
10282           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10283                                                  new_stmt);
10284           gcc_assert (!new_bb);
10285         }
10286     }
10287   else
10288     {
10289       /* iv_loop is the loop to be vectorized. Create:
10290          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
10291       stmts = NULL;
10292       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10293
10294       unsigned HOST_WIDE_INT const_nunits;
10295       if (nunits.is_constant (&const_nunits))
10296         {
10297           tree_vector_builder elts (step_vectype, const_nunits, 1);
10298           elts.quick_push (new_name);
10299           for (i = 1; i < const_nunits; i++)
10300             {
10301               /* Create: new_name_i = new_name + step_expr  */
10302               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10303                                        new_name, step_expr);
10304               elts.quick_push (new_name);
10305             }
10306           /* Create a vector from [new_name_0, new_name_1, ...,
10307              new_name_nunits-1]  */
10308           vec_init = gimple_build_vector (&stmts, &elts);
10309         }
10310       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10311         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
10312         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10313                                  new_name, step_expr);
10314       else
10315         {
10316           /* Build:
10317                 [base, base, base, ...]
10318                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
10319           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10320           gcc_assert (flag_associative_math);
10321           tree index = build_index_vector (step_vectype, 0, 1);
10322           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10323                                                         new_name);
10324           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10325                                                         step_expr);
10326           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10327           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10328                                    vec_init, step_vec);
10329           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10330                                    vec_init, base_vec);
10331         }
10332       vec_init = gimple_convert (&stmts, vectype, vec_init);
10333
10334       if (stmts)
10335         {
10336           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10337           gcc_assert (!new_bb);
10338         }
10339     }
10340
10341
10342   /* Create the vector that holds the step of the induction.  */
10343   gimple_stmt_iterator *step_iv_si = NULL;
10344   if (nested_in_vect_loop)
10345     /* iv_loop is nested in the loop to be vectorized. Generate:
10346        vec_step = [S, S, S, S]  */
10347     new_name = step_expr;
10348   else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10349     {
10350       /* When we're using loop_len produced by SELECT_VL, the non-final
10351          iterations are not always processing VF elements.  So vectorize
10352          induction variable instead of
10353
10354            _21 = vect_vec_iv_.6_22 + { VF, ... };
10355
10356          We should generate:
10357
10358            _35 = .SELECT_VL (ivtmp_33, VF);
10359            vect_cst__22 = [vec_duplicate_expr] _35;
10360            _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
10361       gcc_assert (!slp_node);
10362       gimple_seq seq = NULL;
10363       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10364       tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10365       expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10366                                                  unshare_expr (len)),
10367                                    &seq, true, NULL_TREE);
10368       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10369                                step_expr);
10370       gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10371       step_iv_si = &si;
10372     }
10373   else
10374     {
10375       /* iv_loop is the loop to be vectorized. Generate:
10376           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
10377       gimple_seq seq = NULL;
10378       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10379         {
10380           expr = build_int_cst (integer_type_node, vf);
10381           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10382         }
10383       else
10384         expr = build_int_cst (TREE_TYPE (step_expr), vf);
10385       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10386                                expr, step_expr);
10387       if (seq)
10388         {
10389           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10390           gcc_assert (!new_bb);
10391         }
10392     }
10393
10394   t = unshare_expr (new_name);
10395   gcc_assert (CONSTANT_CLASS_P (new_name)
10396               || TREE_CODE (new_name) == SSA_NAME);
10397   new_vec = build_vector_from_val (step_vectype, t);
10398   vec_step = vect_init_vector (loop_vinfo, stmt_info,
10399                                new_vec, step_vectype, step_iv_si);
10400
10401
10402   /* Create the following def-use cycle:
10403      loop prolog:
10404          vec_init = ...
10405          vec_step = ...
10406      loop:
10407          vec_iv = PHI <vec_init, vec_loop>
10408          ...
10409          STMT
10410          ...
10411          vec_loop = vec_iv + vec_step;  */
10412
10413   /* Create the induction-phi that defines the induction-operand.  */
10414   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10415   induction_phi = create_phi_node (vec_dest, iv_loop->header);
10416   induc_def = PHI_RESULT (induction_phi);
10417
10418   /* Create the iv update inside the loop  */
10419   stmts = NULL;
10420   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10421   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10422   vec_def = gimple_convert (&stmts, vectype, vec_def);
10423   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10424   new_stmt = SSA_NAME_DEF_STMT (vec_def);
10425
10426   /* Set the arguments of the phi node:  */
10427   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10428   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10429                UNKNOWN_LOCATION);
10430
10431   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10432   *vec_stmt = induction_phi;
10433
10434   /* In case that vectorization factor (VF) is bigger than the number
10435      of elements that we can fit in a vectype (nunits), we have to generate
10436      more than one vector stmt - i.e - we need to "unroll" the
10437      vector stmt by a factor VF/nunits.  For more details see documentation
10438      in vectorizable_operation.  */
10439
10440   if (ncopies > 1)
10441     {
10442       gimple_seq seq = NULL;
10443       /* FORNOW. This restriction should be relaxed.  */
10444       gcc_assert (!nested_in_vect_loop);
10445       /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1.  */
10446       gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10447
10448       /* Create the vector that holds the step of the induction.  */
10449       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10450         {
10451           expr = build_int_cst (integer_type_node, nunits);
10452           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10453         }
10454       else
10455         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10456       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10457                                expr, step_expr);
10458       if (seq)
10459         {
10460           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10461           gcc_assert (!new_bb);
10462         }
10463
10464       t = unshare_expr (new_name);
10465       gcc_assert (CONSTANT_CLASS_P (new_name)
10466                   || TREE_CODE (new_name) == SSA_NAME);
10467       new_vec = build_vector_from_val (step_vectype, t);
10468       vec_step = vect_init_vector (loop_vinfo, stmt_info,
10469                                    new_vec, step_vectype, NULL);
10470
10471       vec_def = induc_def;
10472       for (i = 1; i < ncopies + 1; i++)
10473         {
10474           /* vec_i = vec_prev + vec_step  */
10475           gimple_seq stmts = NULL;
10476           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10477           vec_def = gimple_build (&stmts,
10478                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
10479           vec_def = gimple_convert (&stmts, vectype, vec_def);
10480
10481           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10482           if (i < ncopies)
10483             {
10484               new_stmt = SSA_NAME_DEF_STMT (vec_def);
10485               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10486             }
10487           else
10488             {
10489               /* vec_1 = vec_iv + (VF/n * S)
10490                  vec_2 = vec_1 + (VF/n * S)
10491                  ...
10492                  vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10493
10494                  vec_n is used as vec_loop to save the large step register and
10495                  related operations.  */
10496               add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10497                            UNKNOWN_LOCATION);
10498             }
10499         }
10500     }
10501
10502   if (dump_enabled_p ())
10503     dump_printf_loc (MSG_NOTE, vect_location,
10504                      "transform induction: created def-use cycle: %G%G",
10505                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10506
10507   return true;
10508 }
10509
10510 /* Function vectorizable_live_operation.
10511
10512    STMT_INFO computes a value that is used outside the loop.  Check if
10513    it can be supported.

         VINFO is either a loop_vec_info (loop vectorization) or not
         (basic-block vectorization); the two cases are code-generated
         differently below.
         SLP_NODE, when non-null, is the SLP node containing STMT_INFO and
         SLP_INDEX is the lane of STMT_INFO within it.  SLP_NODE_INSTANCE is
         only forwarded to vect_create_epilog_for_reduction.
         When VEC_STMT_P is false only the analysis is done: requirements for
         partial vectors are recorded (or partial vectors are disabled) and,
         for basic-block vectorization, one vec_to_scalar epilogue cost is
         added to COST_VEC.  When VEC_STMT_P is true the lane extraction is
         emitted and uses of the scalar result are redirected to it where
         possible.
         Return false only if the vector and lane holding the final SLP value
         cannot be determined; return true otherwise.  */
10514
10515 bool
10516 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10517                              slp_tree slp_node, slp_instance slp_node_instance,
10518                              int slp_index, bool vec_stmt_p,
10519                              stmt_vector_for_cost *cost_vec)
10520 {
        /* LOOP_VINFO is null when VINFO is not a loop_vec_info, i.e. for
           basic-block vectorization.  */
10521   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10522   imm_use_iterator imm_iter;
10523   tree lhs, lhs_type, bitsize;
10524   tree vectype = (slp_node
10525                   ? SLP_TREE_VECTYPE (slp_node)
10526                   : STMT_VINFO_VECTYPE (stmt_info));
10527   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10528   int ncopies;
10529   gimple *use_stmt;
10530   auto_vec<tree> vec_oprnds;
        /* For SLP: index of the vector holding the live lane and the lane
           within that vector; both are computed further below.  */
10531   int vec_entry = 0;
10532   poly_uint64 vec_index = 0;
10533
10534   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10535
10536   /* If a stmt of a reduction is live, vectorize it via
10537      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
10538      validity so just trigger the transform here.  */
10539   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10540     {
10541       if (!vec_stmt_p)
10542         return true;
10543       if (slp_node)
10544         {
10545           /* For reduction chains the meta-info is attached to
10546              the group leader.  */
10547           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10548             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10549           /* For SLP reductions we vectorize the epilogue for
10550              all involved stmts together.  */
10551           else if (slp_index != 0)
10552             return true;
10553         }
10554       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10555       gcc_assert (reduc_info->is_reduc_info);
            /* No epilogue is created here for fold-left and extract-last
               reductions.  */
10556       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10557           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10558         return true;
10559       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10560                                         slp_node_instance);
10561       return true;
10562     }
10563
10564   /* If STMT is not relevant and it is a simple assignment and its inputs are
10565      invariant then it can remain in place, unvectorized.  The original last
10566      scalar value that it computes will be used.  */
10567   if (!STMT_VINFO_RELEVANT_P (stmt_info))
10568     {
10569       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10570       if (dump_enabled_p ())
10571         dump_printf_loc (MSG_NOTE, vect_location,
10572                          "statement is simple and uses invariant.  Leaving in "
10573                          "place.\n");
10574       return true;
10575     }
10576
        /* NCOPIES is forced to 1 for SLP; the vector to use is then selected
           through VEC_ENTRY instead.  */
10577   if (slp_node)
10578     ncopies = 1;
10579   else
10580     ncopies = vect_get_num_copies (loop_vinfo, vectype);
10581
10582   if (slp_node)
10583     {
10584       gcc_assert (slp_index >= 0);
10585
10586       /* Get the last occurrence of the scalar index from the concatenation of
10587          all the slp vectors. Calculate which slp vector it is and the index
10588          within.  */
10589       int num_scalar = SLP_TREE_LANES (slp_node);
10590       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10591       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10592
10593       /* Calculate which vector contains the result, and which lane of
10594          that vector we need.  */
10595       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10596         {
10597           if (dump_enabled_p ())
10598             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10599                              "Cannot determine which vector holds the"
10600                              " final result.\n");
10601           return false;
10602         }
10603     }
10604
10605   if (!vec_stmt_p)
10606     {
10607       /* No transformation required.  */
            /* Decide whether this live operation still allows the loop to
               operate on partial vectors: either record the loop mask or
               loop length it needs, or disable partial vectors.  */
10608       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10609         {
10610           if (slp_node)
10611             {
10612               if (dump_enabled_p ())
10613                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10614                                  "can't operate on partial vectors "
10615                                  "because an SLP statement is live after "
10616                                  "the loop.\n");
10617               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10618             }
10619           else if (ncopies > 1)
10620             {
10621               if (dump_enabled_p ())
10622                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10623                                  "can't operate on partial vectors "
10624                                  "because ncopies is greater than 1.\n");
10625               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10626             }
10627           else
10628             {
10629               gcc_assert (ncopies == 1 && !slp_node);
                    /* Prefer a mask-based IFN_EXTRACT_LAST; otherwise fall
                       back to a length-based variable-index extract.  */
10630               if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10631                                                   OPTIMIZE_FOR_SPEED))
10632                 vect_record_loop_mask (loop_vinfo,
10633                                        &LOOP_VINFO_MASKS (loop_vinfo),
10634                                        1, vectype, NULL);
10635               else if (can_vec_extract_var_idx_p (
10636                          TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10637                 vect_record_loop_len (loop_vinfo,
10638                                       &LOOP_VINFO_LENS (loop_vinfo),
10639                                       1, vectype, 1);
10640               else
10641                 {
10642                   if (dump_enabled_p ())
10643                     dump_printf_loc (
10644                       MSG_MISSED_OPTIMIZATION, vect_location,
10645                       "can't operate on partial vectors "
10646                       "because the target doesn't support extract "
10647                       "last reduction.\n");
10648                   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10649                 }
10650             }
10651         }
10652       /* ???  Enable for loop costing as well.  */
            /* Only basic-block vectorization is costed: one vec_to_scalar
               in the epilogue.  */
10653       if (!loop_vinfo)
10654         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10655                           0, vect_epilogue);
10656       return true;
10657     }
10658
10659   /* Use the lhs of the original scalar statement.  */
10660   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10661   if (dump_enabled_p ())
10662     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10663                      "stmt %G", stmt);
10664
10665   lhs = gimple_get_lhs (stmt);
10666   lhs_type = TREE_TYPE (lhs);
10667
10668   bitsize = vector_element_bits_tree (vectype);
10669
10670   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
10671   tree vec_lhs, bitstart;
10672   gimple *vec_stmt;
10673   if (slp_node)
10674     {
10675       gcc_assert (!loop_vinfo
10676                   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10677                       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10678
10679       /* Get the correct slp vectorized stmt.  */
10680       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10681       vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10682
10683       /* Get entry to use.  */
10684       bitstart = bitsize_int (vec_index);
10685       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10686     }
10687   else
10688     {
10689       /* For multiple copies, get the last copy.  */
10690       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10691       vec_lhs = gimple_get_lhs (vec_stmt);
10692
10693       /* Get the last lane in the vector.  */
10694       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10695     }
10696
10697   if (loop_vinfo)
10698     {
10699       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10700          requirement, insert one phi node for it.  It looks like:
10701            loop;
10702          BB:
10703            # lhs' = PHI <lhs>
10704          ==>
10705            loop;
10706          BB:
10707            # vec_lhs' = PHI <vec_lhs>
10708            new_tree = lane_extract <vec_lhs', ...>;
10709            lhs' = new_tree;  */
10710
10711       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10712       basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
10713       gcc_assert (single_pred_p (exit_bb));
10714
10715       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10716       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10717       SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
10718
            /* STMTS collects the extraction code that is inserted into
               EXIT_BB below; NEW_TREE is the extracted scalar converted to
               LHS_TYPE.  */
10719       gimple_seq stmts = NULL;
10720       tree new_tree;
10721       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10722         {
10723           /* Emit:
10724
10725                SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10726
10727              where VEC_LHS is the vectorized live-out result and LEN is
10728              the loop length returned by vect_get_loop_len.  */
10729           gcc_assert (ncopies == 1 && !slp_node);
                /* NOTE(review): unlike the fully-masked case below, TEM is
                   never appended to STMTS here; presumably vect_get_loop_len
                   emits no statements for this call -- confirm.  */
10730           gimple_seq tem = NULL;
10731           gimple_stmt_iterator gsi = gsi_last (tem);
10732           tree len
10733             = vect_get_loop_len (loop_vinfo, &gsi,
10734                                  &LOOP_VINFO_LENS (loop_vinfo),
10735                                  1, vectype, 0, 0);
10736
10737           /* BIAS - 1.  */
10738           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10739           tree bias_minus_one
10740             = int_const_binop (MINUS_EXPR,
10741                                build_int_cst (TREE_TYPE (len), biasval),
10742                                build_one_cst (TREE_TYPE (len)));
10743
10744           /* LAST_INDEX = LEN + (BIAS - 1).  */
10745           tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10746                                           len, bias_minus_one);
10747
10748           /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
10749           tree scalar_res
10750             = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10751                             vec_lhs_phi, last_index);
10752
10753           /* Convert the extracted vector element to the scalar type.  */
10754           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10755         }
10756       else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10757         {
10758           /* Emit:
10759
10760                SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10761
10762              where VEC_LHS is the vectorized live-out result and MASK is
10763              the loop mask for the final iteration.  */
10764           gcc_assert (ncopies == 1 && !slp_node);
10765           tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
                /* Statements needed to compute the mask are collected in TEM
                   and placed in front of the extraction in STMTS.  */
10766           gimple_seq tem = NULL;
10767           gimple_stmt_iterator gsi = gsi_last (tem);
10768           tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10769                                           &LOOP_VINFO_MASKS (loop_vinfo),
10770                                           1, vectype, 0);
10771           gimple_seq_add_seq (&stmts, tem);
10772           tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10773                                           mask, vec_lhs_phi);
10774
10775           /* Convert the extracted vector element to the scalar type.  */
10776           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10777         }
10778       else
10779         {
                /* Neither masks nor lengths are in use: extract the lane at
                   BITSTART with a BIT_FIELD_REF.  For boolean vectors the
                   element is read through an integer type of BITSIZE bits.  */
10780           tree bftype = TREE_TYPE (vectype);
10781           if (VECTOR_BOOLEAN_TYPE_P (vectype))
10782             bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10783           new_tree = build3 (BIT_FIELD_REF, bftype,
10784                              vec_lhs_phi, bitsize, bitstart);
10785           new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10786                                            &stmts, true, NULL_TREE);
10787         }
10788
10789       gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10790       if (stmts)
10791         gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10792
10793       /* Remove existing phis that copy from lhs and create copies
10794          from new_tree.  */
10795       gimple_stmt_iterator gsi;
10796       for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
10797         {
10798           gimple *phi = gsi_stmt (gsi);
10799           if ((gimple_phi_arg_def (phi, 0) == lhs))
10800             {
                    /* The iterator is not advanced in this arm; the loop
                       relies on remove_phi_node to move GSI on.
                       NOTE(review): PHI's result is read after the node was
                       removed; this relies on the result staying accessible
                       when the lhs is not released -- confirm.  */
10801               remove_phi_node (&gsi, false);
10802               tree lhs_phi = gimple_phi_result (phi);
10803               gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10804               gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10805             }
10806           else
10807             gsi_next (&gsi);
10808         }
10809
10810       /* There are no further out-of-loop uses of lhs by LC-SSA construction.  */
10811       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10812         gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10813     }
10814   else
10815     {
10816       /* For basic-block vectorization simply insert the lane-extraction.  */
10817       tree bftype = TREE_TYPE (vectype);
10818       if (VECTOR_BOOLEAN_TYPE_P (vectype))
10819         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10820       tree new_tree = build3 (BIT_FIELD_REF, bftype,
10821                               vec_lhs, bitsize, bitstart);
10822       gimple_seq stmts = NULL;
10823       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10824                                        &stmts, true, NULL_TREE);
            /* Carry the abnormal-PHI flag of the scalar lhs over to its
               replacement.  */
10825       if (TREE_CODE (new_tree) == SSA_NAME
10826           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10827         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
            /* Insert the extraction right after the vector definition; when
               that definition is a PHI, insert after the labels of its
               block instead.  */
10828       if (is_a <gphi *> (vec_stmt))
10829         {
10830           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10831           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10832         }
10833       else
10834         {
10835           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10836           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10837         }
10838
10839       /* Replace use of lhs with newly computed result.  If the use stmt is a
10840          single arg PHI, just replace all uses of PHI result.  It's necessary
10841          because lcssa PHI defining lhs may be before newly inserted stmt.  */
10842       use_operand_p use_p;
10843       stmt_vec_info use_stmt_info;
            /* Debug stmts and uses in pure-SLP stmts are left alone.  */
10844       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10845         if (!is_gimple_debug (use_stmt)
10846             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10847                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10848           {
10849             /* ???  This can happen when the live lane ends up being
10850                rooted in a vector construction code-generated by an
10851                external SLP node (and code-generation for that already
10852                happened).  See gcc.dg/vect/bb-slp-47.c.
10853                Doing this is what would happen if that vector CTOR
10854                were not code-generated yet so it is not too bad.
10855                ???  In fact we'd likely want to avoid this situation
10856                in the first place.  */
10857             if (TREE_CODE (new_tree) == SSA_NAME
10858                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10859                 && gimple_code (use_stmt) != GIMPLE_PHI
10860                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10861                                                 use_stmt))
10862               {
10863                 if (dump_enabled_p ())
10864                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10865                                    "Using original scalar computation for "
10866                                    "live lane because use preceeds vector "
10867                                    "def\n");
10868                 continue;
10869               }
10870             /* ???  It can also happen that we end up pulling a def into
10871                a loop where replacing out-of-loop uses would require
10872                a new LC SSA PHI node.  Retain the original scalar in
10873                those cases as well.  PR98064.  */
10874             if (TREE_CODE (new_tree) == SSA_NAME
10875                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10876                 && (gimple_bb (use_stmt)->loop_father
10877                     != gimple_bb (vec_stmt)->loop_father)
10878                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10879                                         gimple_bb (use_stmt)->loop_father))
10880               {
10881                 if (dump_enabled_p ())
10882                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10883                                    "Using original scalar computation for "
10884                                    "live lane because there is an out-of-loop "
10885                                    "definition for it\n");
10886                 continue;
10887               }
                  /* Redirect every operand of USE_STMT that refers to LHS to
                     the extracted lane.  */
10888             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10889               SET_USE (use_p, new_tree);
10890             update_stmt (use_stmt);
10891           }
10892     }
10893
10894   return true;
10895 }
10896
10897 /* Reset the value of every debug bind statement that lies outside LOOP
10898    and uses an SSA name defined by the statement of STMT_INFO.  */
10899
10900 static void
10901 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10902 {
10903   ssa_op_iter def_iter;
10904   imm_use_iterator use_iter;
10905   def_operand_p def_p;
10906   gimple *use_stmt;
10907
10908   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, def_iter, SSA_OP_DEF)
10909     {
10910       tree def = DEF_FROM_PTR (def_p);
10911
10912       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
10913         {
10914           /* Uses by ordinary statements are left untouched.  */
10915           if (!is_gimple_debug (use_stmt))
10916             continue;
10917
10918           /* So are debug statements that stay inside LOOP.  */
10919           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10920             continue;
10921
10922           /* Debug binds are the only debug uses expected out here.  */
10923           if (!gimple_debug_bind_p (use_stmt))
10924             gcc_unreachable ();
10925
10926           if (dump_enabled_p ())
10927             dump_printf_loc (MSG_NOTE, vect_location,
10928                              "killing debug use\n");
10929
10930           gimple_debug_bind_reset_value (use_stmt);
10931           update_stmt (use_stmt);
10932         }
10933     }
10934 }
10935
10936 /* Return true if LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) of the
10937    loop represented by LOOP_VINFO can be shown to be computed without
10938    overflow, and false otherwise.  */
10939
10940 static bool
10941 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10942 {
10943   /* With a known iteration count compare the two constants directly.  */
10944   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10945     {
10946       tree niters = LOOP_VINFO_NITERS (loop_vinfo);
10947       tree nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10948
10949       gcc_assert (TREE_CODE (niters) == INTEGER_CST);
10950       gcc_assert (TREE_CODE (nitersm1) == INTEGER_CST);
10951       if (wi::to_widest (nitersm1) < wi::to_widest (niters))
10952         return true;
10953     }
10954
10955   /* Otherwise use the bound reported by get_max_loop_iterations, if any.  */
10956   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10957   widest_int bound;
10958   if (!get_max_loop_iterations (loop, &bound))
10959     return false;
10960
10961   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10962   widest_int limit = widest_int::from (wi::max_value (niters_type),
10963                                        TYPE_SIGN (niters_type));
10964   /* The bound has to be strictly below the maximum value of the type
10965      of NITERS.  */
10966   return bound < limit;
10967 }
10968
10969 /* Build the mask type that has mode NEW_MODE and half as many elements
10970    as the mask type OLD_TYPE.  */
10971
10972 tree
10973 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10974 {
10975   poly_uint64 half_nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10976   return build_truth_vector_type_for_mode (half_nunits, new_mode);
10977 }
10978
10979 /* Build the mask type that has mode NEW_MODE and twice as many elements
10980    as the mask type OLD_TYPE.  */
10981
10982 tree
10983 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10984 {
10985   poly_uint64 doubled_nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10986   return build_truth_vector_type_for_mode (doubled_nunits, new_mode);
10987 }
10988
10989 /* Note that a fully-masked version of LOOP_VINFO would need MASKS to
10990    hold a sequence of NVECTORS masks, each controlling a vector of type
10991    VECTYPE.  A nonnull SCALAR_MASK means the fully-masked loop would AND
10992    these vector masks with the vector version of SCALAR_MASK.  */
10993
10994 void
10995 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10996                        unsigned int nvectors, tree vectype, tree scalar_mask)
10997 {
10998   gcc_assert (nvectors != 0);
10999
11000   if (scalar_mask != NULL_TREE)
11001     {
11002       scalar_cond_masked_key masked_key (scalar_mask, nvectors);
11003       loop_vinfo->scalar_cond_masked_set.add (masked_key);
11004     }
11005
11006   masks->mask_set.add (std::make_pair (vectype, nvectors));
11007 }
11008
11009 /* Given a complete set of masks MASKS, extract mask number INDEX
11010    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11011    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
11012    How the rgroup is located depends on the partial vectors style of
11013    LOOP_VINFO; styles other than while_ult and avx512 are not expected
11014    here.  See the comment above vec_loop_masks for the mask arrangement.  */
11015
11016 tree
11017 vect_get_loop_mask (loop_vec_info loop_vinfo,
11018                     gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11019                     unsigned int nvectors, tree vectype, unsigned int index)
11020 {
11021   if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11022       == vect_partial_vectors_while_ult)
11023     {
11024       rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11025       tree mask_type = rgm->type;
11026
11027       /* Populate the rgroup's mask array, if this is the first time we've
11028          used it.  */
11029       if (rgm->controls.is_empty ())
11030         {
11031           rgm->controls.safe_grow_cleared (nvectors, true);
11032           for (unsigned int i = 0; i < nvectors; ++i)
11033             {
11034               tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11035               /* Provide a dummy definition until the real one is available.  */
11036               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11037               rgm->controls[i] = mask;
11038             }
11039         }
11040
11041       tree mask = rgm->controls[index];
11042       if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11043                     TYPE_VECTOR_SUBPARTS (vectype)))
11044         {
11045           /* A loop mask for data type X can be reused for data type Y
11046              if X has N times more elements than Y and if Y's elements
11047              are N times bigger than X's.  In this case each sequence
11048              of N elements in the loop mask will be all-zero or all-one.
11049              We can then view-convert the mask so that each sequence of
11050              N elements is replaced by a single element.  */
11051           gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11052                                   TYPE_VECTOR_SUBPARTS (vectype)));
11053           gimple_seq seq = NULL;
11054           mask_type = truth_type_for (vectype);
11055           mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11056           if (seq)
11057             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11058         }
11059       return mask;
11060     }
11061   else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11062            == vect_partial_vectors_avx512)
11063     {
11064       /* Scalars per iteration and vector count are compile-time constants;
11065          the former (minus one) indexes rgc_vec in this style.  */
11066       unsigned int nscalars_per_iter
11067         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11068                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11069
11070       rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11071
11072       /* The stored nV is dependent on the mask type produced.  */
11073       gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11074                              TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11075                   == rgm->factor);
11076       nvectors = rgm->factor;
11077
11078       /* Populate the rgroup's mask array, if this is the first time we've
11079          used it.  */
11080       if (rgm->controls.is_empty ())
11081         {
11082           rgm->controls.safe_grow_cleared (nvectors, true);
11083           for (unsigned int i = 0; i < nvectors; ++i)
11084             {
11085               tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11086               /* Provide a dummy definition until the real one is available.  */
11087               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11088               rgm->controls[i] = mask;
11089             }
11090         }
11091       if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11092                     TYPE_VECTOR_SUBPARTS (vectype)))
11093         return rgm->controls[index];
11094
11095       /* Otherwise each control holds FACTOR masks for VECTYPE: pick control
11096          INDEX / FACTOR and shift part INDEX % FACTOR down.  The masks have
11097          integer modes with AVX512, so this is done on the integer view.  */
11098       unsigned HOST_WIDE_INT factor;
11099       bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11100                                      TYPE_VECTOR_SUBPARTS (vectype), &factor);
11101       gcc_assert (ok);
11102       gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11103       tree mask_type = truth_type_for (vectype);
11104       gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11105       unsigned vi = index / factor;
11106       unsigned vpart = index % factor;
11107       tree vec = rgm->controls[vi];
11108       gimple_seq seq = NULL;
11109       vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11110                           lang_hooks.types.type_for_mode
11111                                 (TYPE_MODE (rgm->type), 1), vec);
11112       /* For integer mode masks simply shift the right bits into position.  */
11113       if (vpart != 0)
11114         vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11115                             build_int_cst (integer_type_node,
11116                                            (TYPE_VECTOR_SUBPARTS (vectype)
11117                                             * vpart)));
11118       vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11119                                     (TYPE_MODE (mask_type), 1), vec);
11120       vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11121       if (seq)
11122         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11123       return vec;
11124     }
11125   else
11126     gcc_unreachable ();
11127 }
11128
11129 /* Note that LOOP_VINFO needs LENS to provide a sequence of NVECTORS
11130    lengths that control an operation on VECTYPE.  Each element of VECTYPE
11131    is split by the operation into FACTOR separate subelements, and the
11132    length is measured as a number of these subelements.  */
11133
11134 void
11135 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11136                       unsigned int nvectors, tree vectype, unsigned int factor)
11137 {
11138   gcc_assert (nvectors != 0);
11139   if (nvectors > lens->length ())
11140     lens->safe_grow_cleared (nvectors, true);
11141   rgroup_controls &rgroup = (*lens)[nvectors - 1];
11142
11143   /* The number of scalars per iteration, scalar occupied bytes and
11144      the number of vectors are all compile-time constants.  */
11145   unsigned int nscalars
11146     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11147                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11148
11149   /* Only a larger number of scalars per iteration updates the rgroup.  */
11150   if (rgroup.max_nscalars_per_iter >= nscalars)
11151     return;
11152   /* For now, we only support cases in which all loads and stores fall back
11153      to VnQI or none do.  */
11154   gcc_assert (!rgroup.max_nscalars_per_iter
11155               || (rgroup.factor == 1 && factor == 1)
11156               || (rgroup.max_nscalars_per_iter * rgroup.factor
11157                   == nscalars * factor));
11158   rgroup.max_nscalars_per_iter = nscalars;
11159   rgroup.type = vectype;
11160   rgroup.factor = factor;
11161 }
11162
11163 /* Given a complete set of lengths LENS, extract length number INDEX
11164    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11165    where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
11166    multiplied by the number of elements that should be processed.
11167    Insert any set-up statements before GSI.  */
11168
11169 tree
11170 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11171                    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11172                    unsigned int index, unsigned int factor)
11173 {
11174   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11175   bool use_bias_adjusted_len =
11176     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11177
11178   /* Populate the rgroup's len array, if this is the first time we've
11179      used it.  */
11180   if (rgl->controls.is_empty ())
11181     {
11182       rgl->controls.safe_grow_cleared (nvectors, true);
11183       for (unsigned int i = 0; i < nvectors; ++i)
11184         {
11185           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11186           gcc_assert (len_type != NULL_TREE);
11187
11188           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11189
11190           /* Provide a dummy definition until the real one is available.  */
11191           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11192           rgl->controls[i] = len;
11193
11194           if (use_bias_adjusted_len)
11195             {
11196               gcc_assert (i == 0);
11197               tree adjusted_len =
11198                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11199               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11200               rgl->bias_adjusted_ctrl = adjusted_len;
11201             }
11202         }
11203     }
11204
11205   if (use_bias_adjusted_len)
11206     return rgl->bias_adjusted_ctrl;
11207
11208   tree loop_len = rgl->controls[index];
11209   if (rgl->factor == 1 && factor == 1)
11210     {
11211       poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11212       poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11213       if (maybe_ne (nunits1, nunits2))
11214         {
11215           /* A loop len for data type X can be reused for data type Y
11216              if X has N times more elements than Y and if Y's elements
11217              are N times bigger than X's; Y's len is X's len divided by N.  */
11218           gcc_assert (multiple_p (nunits1, nunits2));
11219           factor = exact_div (nunits1, nunits2).to_constant ();
11220           tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11221           gimple_seq seq = NULL;
11222           loop_len = gimple_build (&seq, TRUNC_DIV_EXPR, iv_type, loop_len,
11223                                    build_int_cst (iv_type, factor));
11224           if (seq)
11225             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11226         }
11227     }
11228   return loop_len;
11229 }
11230
11231 /* Scale profiling counters by estimation for LOOP which is vectorized
11232    by factor VF.  EXIT_E is the exit edge whose probability is adjusted.
11233    If FLAT is true, the loop we started with had unrealistically flat
11234    profile and the counts are only capped, not scaled down by VF.  */
11235
11236 static void
11237 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11238 {
11239   /* For flat profiles do not scale down proportionally by VF and only
11240      cap by known iteration count bounds.  */
11241   if (flat)
11242     {
11243       if (dump_file && (dump_flags & TDF_DETAILS))
11244         fprintf (dump_file,
11245                  "Vectorized loop profile seems flat; not scaling iteration "
11246                  "count down by the vectorization factor %i\n", vf);
11247       scale_loop_profile (loop, profile_probability::always (),
11248                           get_likely_max_loop_iterations_int (loop));
11249       return;
11250     }
11251   /* Loop body executes VF fewer times and exit increases VF times.  */
11252   profile_count entry_count = loop_preheader_edge (loop)->count ();
11253
11254   /* If we have unreliable loop profile avoid dropping entry
11255      count below header count.  This can happen since loops
11256      have unrealistically low trip counts.  */
11257   while (vf > 1
11258          && loop->header->count > entry_count
11259          && loop->header->count < entry_count * vf)
11260     {
11261       if (dump_file && (dump_flags & TDF_DETAILS))
11262         fprintf (dump_file,
11263                  "Vectorization factor %i seems too large for profile "
11264                  "prevoiusly believed to be consistent; reducing.\n", vf);
11265       vf /= 2;
11266     }
11267
11268   if (entry_count.nonzero_p ())
11269     set_edge_probability_and_rescale_others
11270             (exit_e,
11271              entry_count.probability_in (loop->header->count / vf));
11272   /* Avoid producing very large exit probability when we do not have
11273      sensible profile.  */
11274   else if (exit_e->probability < profile_probability::always () / (vf * 2))
11275     set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11276   loop->latch->count = single_pred_edge (loop->latch)->count ();
11277
11278   scale_loop_profile (loop, profile_probability::always () / vf,
11279                       get_likely_max_loop_iterations_int (loop));
11280 }
11281
11282 /* For a vectorized stmt DEF_STMT_INFO add its vector defs as the latch
11283    edge values of the vectorized loop-header PHIs its scalar def fed.  */
11284
11285 static void
11286 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11287                                      stmt_vec_info def_stmt_info)
11288 {
11289   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11290   if (!def || TREE_CODE (def) != SSA_NAME)
11291     return;
11292   stmt_vec_info phi_info;
11293   imm_use_iterator iter;
11294   use_operand_p use_p;
11295   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11296     {
11297       gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11298       if (!phi)
11299         continue;
11300       if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11301             && (phi_info = loop_vinfo->lookup_stmt (phi))
11302             && STMT_VINFO_RELEVANT_P (phi_info)))
11303         continue;
11304       loop_p loop = gimple_bb (phi)->loop_father;
11305       edge e = loop_latch_edge (loop);
11306       if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11307         continue;
11308
11309       if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11310           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11311           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11312         {
11313           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11314           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11315           gcc_assert (phi_defs.length () == latch_defs.length ());
11316           for (unsigned i = 0; i < phi_defs.length (); ++i)
11317             add_phi_arg (as_a <gphi *> (phi_defs[i]),
11318                          gimple_get_lhs (latch_defs[i]), e,
11319                          gimple_phi_arg_location (phi, e->dest_idx));
11320         }
11321       else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11322         {
11323           /* For first order recurrences we have to update both uses of
11324              the latch definition, the one in the PHI node and the one
11325              in the generated VEC_PERM_EXPR.  */
11326           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11327           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11328           gcc_assert (phi_defs.length () == latch_defs.length ());
11329           tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11330           gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11331           for (unsigned i = 0; i < phi_defs.length (); ++i)
11332             {
11333               gassign *perm = as_a <gassign *> (phi_defs[i]);
11334               if (i > 0)
11335                 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11336               gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11337               update_stmt (perm);
11338             }
11339           add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11340                        gimple_phi_arg_location (phi, e->dest_idx));
11341         }
11342     }
11343 }
11344
11345 /* Vectorize STMT_INFO if relevant, emitting new statements before GSI, and
11346    return true if it was handed to vect_transform_stmt.  *SEEN_STORE is set
11347    to STMT_INFO when STMT_INFO is vectorized as a store.  */
11348
11349 static bool
11350 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11351                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11352 {
11353   gimple *stmt = stmt_info->stmt;
11354
11355   if (dump_enabled_p ())
11356     dump_printf_loc (MSG_NOTE, vect_location,
11357                      "------>vectorizing statement: %G", stmt);
11358
11359   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11360     vect_loop_kill_debug_uses (LOOP_VINFO_LOOP (loop_vinfo), stmt_info);
11361
11362   if (!(STMT_VINFO_RELEVANT_P (stmt_info) || STMT_VINFO_LIVE_P (stmt_info)))
11363     {
11364       /* Nothing to transform, but an IFN_MASK_CALL without a result is
11365          still reported through *SEEN_STORE.  */
11366       if (is_gimple_call (stmt)
11367           && gimple_call_internal_p (stmt, IFN_MASK_CALL))
11368         {
11369           gcc_assert (!gimple_call_lhs (stmt));
11370           *seen_store = stmt_info;
11371         }
11372       return false;
11373     }
11374
11375   /* For SLP VF is set according to unrolling factor, and not
11376      to vector size, hence for SLP this print is not valid.  */
11377   if (tree vectype = STMT_VINFO_VECTYPE (stmt_info))
11378     {
11379       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11380       if (!STMT_SLP_TYPE (stmt_info)
11381           && maybe_ne (TYPE_VECTOR_SUBPARTS (vectype), vf)
11382           && dump_enabled_p ())
11383         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11384     }
11385
11386   /* Pure SLP statements have already been vectorized.  Hybrid SLP
11387      statements still go through loop vectorization below.  */
11388   if (PURE_SLP_STMT (stmt_info))
11389     return false;
11390
11391   if (dump_enabled_p ())
11392     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11393
11394   /* A true result makes STMT_INFO the store seen by the caller.  */
11395   bool is_store = vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL);
11396   if (is_store)
11397     *seen_store = stmt_info;
11398
11399   return true;
11400 }
11401
11402 /* Callback for simplify_replace_tree: CONTEXT is a hash_map<tree, tree>;
11403    return the value it holds for T, or T itself if T is not a key.  */
11404
11405 static tree
11406 find_in_mapping (tree t, void *context)
11407 {
11408   auto *replacements = static_cast<hash_map<tree, tree> *> (context);
11409   if (tree *mapped = replacements->get (t))
11410     return *mapped;
11411   return t;
11412 }
11413
11414 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
11415    original loop that has now been vectorized.
11416
11417    The inits of the data_references need to be advanced with the number of
11418    iterations of the main loop.  This has been computed in vect_do_peeling and
11419    is stored in parameter ADVANCE.  We first restore the data_references
11420    initial offset with the values recorded in ORIG_DRS_INIT.
11421
11422    Since the loop_vec_info of this EPILOGUE was constructed for the original
11423    loop, its stmt_vec_infos all point to the original statements.  These need
11424    to be updated to point to their corresponding copies as well as the SSA_NAMES
11425    in their PATTERN_DEF_SEQs and RELATED_STMTs.
11426
11427    The data_reference's connections also need to be updated.  Their
11428    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11429    stmt_vec_infos, their statements need to point to their corresponding copy,
11430    if they are gather loads or scatter stores then their reference needs to be
11431    updated to point to its corresponding copy and finally we set
11432    'base_misaligned' to false as we have already peeled for alignment in the
11433    prologue of the main loop.  */
11434
11435 static void
11436 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11437 {
11438   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11439   auto_vec<gimple *> stmt_worklist;
11440   hash_map<tree,tree> mapping;
11441   gimple *orig_stmt, *new_stmt;
11442   gimple_stmt_iterator epilogue_gsi;
11443   gphi_iterator epilogue_phi_gsi;
11444   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11445   basic_block *epilogue_bbs = get_loop_body (epilogue);
11446   unsigned i;
11447
11448   free (LOOP_VINFO_BBS (epilogue_vinfo));
11449   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11450
11451   /* Advance data_reference's with the number of iterations of the previous
11452      loop and its prologue.  */
11453   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11454
11455
11456   /* The EPILOGUE loop is a copy of the original loop so they share the same
11457      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
11458      point to the copied statements.  We also create a mapping of all LHS' in
11459      the original loop and all the LHS' in the EPILOGUE and create worklists to
11460      update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
11461   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11462     {
11463       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11464            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11465         {
11466           new_stmt = epilogue_phi_gsi.phi ();
11467
11468           gcc_assert (gimple_uid (new_stmt) > 0);
11469           stmt_vinfo
11470             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11471
11472           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11473           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11474
11475           mapping.put (gimple_phi_result (orig_stmt),
11476                        gimple_phi_result (new_stmt));
11477           /* PHI nodes cannot have patterns or related statements.  */
11478           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11479                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11480         }
11481
11482       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11483            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11484         {
11485           new_stmt = gsi_stmt (epilogue_gsi);
11486           if (is_gimple_debug (new_stmt))
11487             continue;
11488
11489           gcc_assert (gimple_uid (new_stmt) > 0);
11490           stmt_vinfo
11491             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11492
11493           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11494           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11495
11496           if (tree old_lhs = gimple_get_lhs (orig_stmt))
11497             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11498
11499           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11500             {
11501               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11502               for (gimple_stmt_iterator gsi = gsi_start (seq);
11503                    !gsi_end_p (gsi); gsi_next (&gsi))
11504                 stmt_worklist.safe_push (gsi_stmt (gsi));
11505             }
11506
11507           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11508           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11509             {
11510               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11511               stmt_worklist.safe_push (stmt);
11512               /* Set BB such that the assert in
11513                 'get_initial_def_for_reduction' is able to determine that
11514                 the BB of the related stmt is inside this loop.  */
11515               gimple_set_bb (stmt,
11516                              gimple_bb (new_stmt));
11517               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11518               gcc_assert (related_vinfo == NULL
11519                           || related_vinfo == stmt_vinfo);
11520             }
11521         }
11522     }
11523
11524   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11525      using the original main loop and thus need to be updated to refer to the
11526      cloned variables used in the epilogue.  */
11527   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11528     {
11529       gimple *stmt = stmt_worklist[i];
11530       tree *new_op;
11531
11532       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11533         {
11534           tree op = gimple_op (stmt, j);
11535           if ((new_op = mapping.get(op)))
11536             gimple_set_op (stmt, j, *new_op);
11537           else
11538             {
11539               /* PR92429: The last argument of simplify_replace_tree disables
11540                  folding when replacing arguments.  This is required as
11541                  otherwise you might end up with different statements than the
11542                  ones analyzed in vect_loop_analyze, leading to different
11543                  vectorization.  */
11544               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11545                                           &find_in_mapping, &mapping, false);
11546               gimple_set_op (stmt, j, op);
11547             }
11548         }
11549     }
11550   /* Finally reconnect each data reference to its statement copy.  */
11551   struct data_reference *dr;
11552   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11553   FOR_EACH_VEC_ELT (datarefs, i, dr)
11554     {
11555       orig_stmt = DR_STMT (dr);
11556       gcc_assert (gimple_uid (orig_stmt) > 0);
11557       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11558       /* Data references for gather loads and scatter stores do not use the
11559          updated offset we set using ADVANCE.  Instead we have to make sure the
11560          reference in the data references point to the corresponding copy of
11561          the original in the epilogue.  Make sure to update both
11562          gather/scatters recognized by dataref analysis and also other
11563          refs that get_load_store_type classified as VMAT_GATHER_SCATTER.  */
11564       auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11565       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11566           || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11567         {
11568           DR_REF (dr)
11569             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11570                                      &find_in_mapping, &mapping);
11571           DR_BASE_ADDRESS (dr)
11572             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11573                                      &find_in_mapping, &mapping);
11574         }
11575       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11576       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11577       /* The vector size of the epilogue is smaller than that of the main loop
11578          so the alignment is either the same or lower. This means the dr will
11579          thus by definition be aligned.  */
11580       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11581     }
11582
11583   epilogue_vinfo->shared->datarefs_copy.release ();
11584   epilogue_vinfo->shared->save_datarefs ();
11585 }
11586
11587 /* Function vect_transform_loop.
11588
11589    The analysis phase has determined that the loop is vectorizable.
11590    Vectorize the loop - create vectorized stmts to replace the scalar
11591    stmts in the loop, and update the loop exit condition.

         LOOP_VINFO describes the loop being transformed.  LOOP_VECTORIZED_CALL
         is passed on to vect_loop_versioning when the loop requires versioning.

         In order, the function:
           - decides whether a runtime profitability check is needed;
           - makes sure the IV exit edge leads to a single-predecessor block,
             versions the loop if required and does the same for the exit of
             the scalar loop copy;
           - builds the iteration count and peels via vect_do_peeling;
           - computes NITERS_VECTOR and STEP_VECTOR if peeling did not;
           - schedules SLP instances and then transforms the PHIs and
             statements of every basic block of the loop;
           - sets the new loop exit condition, updates the recorded iteration
             bounds and estimates and scales the profile;
           - releases the SLP instances and prepares the epilogue, if any.

11592    Returns the epilogue loop created by vect_do_peeling, or NULL if
         there is none.  */
11593
11594 class loop *
11595 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11596 {
11597   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11598   class loop *epilogue = NULL;
11599   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11600   int nbbs = loop->num_nodes;
11601   int i;
11602   tree niters_vector = NULL_TREE;
11603   tree step_vector = NULL_TREE;
11604   tree niters_vector_mult_vf = NULL_TREE;
11605   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11606   unsigned int lowest_vf = constant_lower_bound (vf);
11607   gimple *stmt;
11608   bool check_profitability = false;
11609   unsigned int th;
11610   bool flat = maybe_flat_loop_profile (loop);
11611
11612   DUMP_VECT_SCOPE ("vec_transform_loop");
11613
11614   loop_vinfo->shared->check_datarefs ();
11615
11616   /* Use the more conservative vectorization threshold.  If the number
11617      of iterations is constant assume the cost check has been performed
11618      by our caller.  If the threshold makes all loops profitable that
11619      run at least the (estimated) vectorization factor number of times
11620      checking is pointless, too.  */
11621   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11622   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11623     {
11624       if (dump_enabled_p ())
11625         dump_printf_loc (MSG_NOTE, vect_location,
11626                          "Profitability threshold is %d loop iterations.\n",
11627                          th);
11628       check_profitability = true;
11629     }
11630
11631   /* Make sure there exists a single-predecessor exit bb.  Do this before
11632      versioning.   */
11633   edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11634   if (! single_pred_p (e->dest))
11635     {
11636       split_loop_exit_edge (e, true);
11637       if (dump_enabled_p ())
11638         dump_printf (MSG_NOTE, "split exit edge\n");
11639     }
11640
11641   /* Version the loop first, if required, so the profitability check
11642      comes first.  */
11643
11644   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11645     {
11646       class loop *sloop
11647         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11648       sloop->force_vectorize = false;
11649       check_profitability = false;
11650     }
11651
11652   /* Make sure there exists a single-predecessor exit bb also on the
11653      scalar loop copy.  Do this after versioning but before peeling
11654      so CFG structure is fine for both scalar and if-converted loop
11655      to make slpeel_duplicate_current_defs_from_edges face matched
11656      loop closed PHI nodes on the exit.  */
11657   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11658     {
11659       e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11660       if (! single_pred_p (e->dest))
11661         {
11662           split_loop_exit_edge (e, true);
11663           if (dump_enabled_p ())
11664             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11665         }
11666     }
11667
11668   tree niters = vect_build_loop_niters (loop_vinfo);
11669   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11670   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11671   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11672   tree advance;
        /* NOTE(review): ORIG_DRS_INIT is not referenced again in this
           function; confirm whether the declaration is still needed.  */
11673   drs_init_vec orig_drs_init;
11674
11675   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11676                               &step_vector, &niters_vector_mult_vf, th,
11677                               check_profitability, niters_no_overflow,
11678                               &advance);
11679   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11680       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11681     {
11682       /* Ifcvt duplicates loop preheader, loop body and produces a basic
11683          block after loop exit.  We need to scale all that.  */
11684       basic_block preheader
11685         = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11686       preheader->count
11687         = preheader->count.apply_probability
11688               (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11689       scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11690                               LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11691       single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11692         = preheader->count;
11693     }
11694
        /* vect_do_peeling may leave NITERS_VECTOR unset; compute it (and
           STEP_VECTOR) here in that case.  */
11695   if (niters_vector == NULL_TREE)
11696     {
11697       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11698           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11699           && known_eq (lowest_vf, vf))
11700         {
11701           niters_vector
11702             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11703                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11704           step_vector = build_one_cst (TREE_TYPE (niters));
11705         }
11706       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11707         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11708                                      &step_vector, niters_no_overflow);
11709       else
11710         /* vect_do_peeling subtracted the number of peeled prologue
11711            iterations from LOOP_VINFO_NITERS.  */
11712         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11713                                      &niters_vector, &step_vector,
11714                                      niters_no_overflow);
11715     }
11716
11717   /* 1) Make sure the loop header has exactly two entries
11718      2) Make sure we have a preheader basic block.  */
11719
11720   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11721
11722   split_edge (loop_preheader_edge (loop));
11723
11724   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11725     /* This will deal with any possible peeling.  */
11726     vect_prepare_for_masked_peels (loop_vinfo);
11727
11728   /* Schedule the SLP instances first, then handle loop vectorization
11729      below.  */
11730   if (!loop_vinfo->slp_instances.is_empty ())
11731     {
11732       DUMP_VECT_SCOPE ("scheduling SLP instances");
11733       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11734     }
11735
11736   /* FORNOW: the vectorizer supports only loops whose body consists
11737      of one basic block (header + empty latch). When the vectorizer will
11738      support more involved loop forms, the order by which the BBs are
11739      traversed need to be reconsidered.  */
11740
11741   for (i = 0; i < nbbs; i++)
11742     {
11743       basic_block bb = bbs[i];
11744       stmt_vec_info stmt_info;
11745
            /* First walk over the PHIs of BB: transform the relevant ones.  */
11746       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11747            gsi_next (&si))
11748         {
11749           gphi *phi = si.phi ();
11750           if (dump_enabled_p ())
11751             dump_printf_loc (MSG_NOTE, vect_location,
11752                              "------>vectorizing phi: %G", (gimple *) phi);
11753           stmt_info = loop_vinfo->lookup_stmt (phi);
11754           if (!stmt_info)
11755             continue;
11756
11757           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11758             vect_loop_kill_debug_uses (loop, stmt_info);
11759
11760           if (!STMT_VINFO_RELEVANT_P (stmt_info)
11761               && !STMT_VINFO_LIVE_P (stmt_info))
11762             continue;
11763
11764           if (STMT_VINFO_VECTYPE (stmt_info)
11765               && (maybe_ne
11766                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11767               && dump_enabled_p ())
11768             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11769
11770           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11771                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11772                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11773                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11774                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11775                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11776               && ! PURE_SLP_STMT (stmt_info))
11777             {
11778               if (dump_enabled_p ())
11779                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11780               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11781             }
11782         }
11783
            /* Second walk over the same PHIs: give
               maybe_set_vectorized_backedge_value a chance to handle each
               relevant PHI that is not pure SLP.  */
11784       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11785            gsi_next (&si))
11786         {
11787           gphi *phi = si.phi ();
11788           stmt_info = loop_vinfo->lookup_stmt (phi);
11789           if (!stmt_info)
11790             continue;
11791
11792           if (!STMT_VINFO_RELEVANT_P (stmt_info)
11793               && !STMT_VINFO_LIVE_P (stmt_info))
11794             continue;
11795
11796           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11797                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11798                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11799                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11800                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11801                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11802               && ! PURE_SLP_STMT (stmt_info))
11803             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11804         }
11805
            /* Walk the ordinary statements of BB.  SI is advanced inside the
               loop body because statements may be removed along the way.  */
11806       for (gimple_stmt_iterator si = gsi_start_bb (bb);
11807            !gsi_end_p (si);)
11808         {
11809           stmt = gsi_stmt (si);
11810           /* During vectorization remove existing clobber stmts.  */
11811           if (gimple_clobber_p (stmt))
11812             {
11813               unlink_stmt_vdef (stmt);
11814               gsi_remove (&si, true);
11815               release_defs (stmt);
11816             }
11817           else
11818             {
11819               /* Ignore vector stmts created in the outer loop.  */
11820               stmt_info = loop_vinfo->lookup_stmt (stmt);
11821
11822               /* vector stmts created in the outer-loop during vectorization of
11823                  stmts in an inner-loop may not have a stmt_info, and do not
11824                  need to be vectorized.  */
11825               stmt_vec_info seen_store = NULL;
11826               if (stmt_info)
11827                 {
11828                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11829                     {
                            /* Transform the statements of the pattern definition
                               sequence first, then the pattern statement itself.  */
11830                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11831                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11832                            !gsi_end_p (subsi); gsi_next (&subsi))
11833                         {
11834                           stmt_vec_info pat_stmt_info
11835                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11836                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11837                                                     &si, &seen_store);
11838                         }
11839                       stmt_vec_info pat_stmt_info
11840                         = STMT_VINFO_RELATED_STMT (stmt_info);
11841                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11842                                                     &si, &seen_store))
11843                         maybe_set_vectorized_backedge_value (loop_vinfo,
11844                                                              pat_stmt_info);
11845                     }
11846                   else
11847                     {
11848                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11849                                                     &seen_store))
11850                         maybe_set_vectorized_backedge_value (loop_vinfo,
11851                                                              stmt_info);
11852                     }
11853                 }
                    /* Step past STMT before dealing with any store recorded in
                       SEEN_STORE by vect_transform_loop_stmt.  */
11854               gsi_next (&si);
11855               if (seen_store)
11856                 {
11857                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11858                     /* Interleaving.  If IS_STORE is TRUE, the
11859                        vectorization of the interleaving chain was
11860                        completed - free all the stores in the chain.  */
11861                     vect_remove_stores (loop_vinfo,
11862                                         DR_GROUP_FIRST_ELEMENT (seen_store));
11863                   else
11864                     /* Free the attached stmt_vec_info and remove the stmt.  */
11865                     loop_vinfo->remove_stmt (stmt_info);
11866                 }
11867             }
11868         }
11869
11870       /* Stub out scalar statements that must not survive vectorization.
11871          Doing this here helps with grouped statements, or statements that
11872          are involved in patterns.  */
11873       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11874            !gsi_end_p (gsi); gsi_next (&gsi))
11875         {
11876           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11877           if (!call || !gimple_call_internal_p (call))
11878             continue;
11879           internal_fn ifn = gimple_call_internal_fn (call);
11880           if (ifn == IFN_MASK_LOAD)
11881             {
                    /* A masked load whose result is not a vector is replaced by
                       an assignment of zero to its lhs.  */
11882               tree lhs = gimple_get_lhs (call);
11883               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11884                 {
11885                   tree zero = build_zero_cst (TREE_TYPE (lhs));
11886                   gimple *new_stmt = gimple_build_assign (lhs, zero);
11887                   gsi_replace (&gsi, new_stmt, true);
11888                 }
11889             }
11890           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11891             {
                    /* A conditional internal function whose result is not a
                       vector is replaced by an assignment of its last argument
                       to its lhs.  */
11892               tree lhs = gimple_get_lhs (call);
11893               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11894                 {
11895                   tree else_arg
11896                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11897                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11898                   gsi_replace (&gsi, new_stmt, true);
11899                 }
11900             }
11901         }
11902     }                           /* BBs in loop */
11903
11904   /* The vectorization factor is always > 1, so if we use an IV increment
11905      of 1, a zero NITERS becomes a nonzero NITERS_VECTOR.  */
11906   if (integer_onep (step_vector))
11907     niters_no_overflow = true;
11908   vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11909                            niters_vector, step_vector, niters_vector_mult_vf,
11910                            !niters_no_overflow);
11911
11912   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11913
11914   /* True if the final iteration might not handle a full vector's
11915      worth of scalar iterations.  */
11916   bool final_iter_may_be_partial
11917     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11918   /* The minimum number of iterations performed by the epilogue.  This
11919      is 1 when peeling for gaps because we always need a final scalar
11920      iteration.  */
11921   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11922   /* +1 to convert latch counts to loop iteration counts,
11923      -min_epilogue_iters to remove iterations that cannot be performed
11924        by the vector code.  */
11925   int bias_for_lowest = 1 - min_epilogue_iters;
11926   int bias_for_assumed = bias_for_lowest;
11927   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11928   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11929     {
11930       /* When the amount of peeling is known at compile time, the first
11931          iteration will have exactly alignment_npeels active elements.
11932          In the worst case it will have at least one.  */
11933       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11934       bias_for_lowest += lowest_vf - min_first_active;
11935       bias_for_assumed += assumed_vf - min_first_active;
11936     }
11937   /* In these calculations the "- 1" converts loop iteration counts
11938      back to latch counts.  */
11939   if (loop->any_upper_bound)
11940     {
            /* MAIN_VINFO is non-null when LOOP_VINFO records an original loop
               info - presumably when LOOP is a vectorized epilogue; in that
               case the bound may be tightened further below.  */
11941       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11942       loop->nb_iterations_upper_bound
11943         = (final_iter_may_be_partial
11944            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11945                             lowest_vf) - 1
11946            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11947                              lowest_vf) - 1);
11948       if (main_vinfo
11949           /* Both peeling for alignment and peeling for gaps can end up
11950              with the scalar epilogue running for more than VF-1 iterations.  */
11951           && !main_vinfo->peeling_for_alignment
11952           && !main_vinfo->peeling_for_gaps)
11953         {
11954           unsigned int bound;
11955           poly_uint64 main_iters
11956             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11957                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11958           main_iters
11959             = upper_bound (main_iters,
11960                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11961           if (can_div_away_from_zero_p (main_iters,
11962                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11963                                         &bound))
11964             loop->nb_iterations_upper_bound
11965               = wi::umin ((bound_wide_int) (bound - 1),
11966                           loop->nb_iterations_upper_bound);
11967       }
11968   }
11969   if (loop->any_likely_upper_bound)
11970     loop->nb_iterations_likely_upper_bound
11971       = (final_iter_may_be_partial
11972          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11973                           + bias_for_lowest, lowest_vf) - 1
11974          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11975                            + bias_for_lowest, lowest_vf) - 1);
11976   if (loop->any_estimate)
11977     loop->nb_iterations_estimate
11978       = (final_iter_may_be_partial
11979          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11980                           assumed_vf) - 1
11981          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11982                            assumed_vf) - 1);
11983   scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11984                                assumed_vf, flat);
11985
11986   if (dump_enabled_p ())
11987     {
11988       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11989         {
11990           dump_printf_loc (MSG_NOTE, vect_location,
11991                            "LOOP VECTORIZED\n");
11992           if (loop->inner)
11993             dump_printf_loc (MSG_NOTE, vect_location,
11994                              "OUTER LOOP VECTORIZED\n");
11995           dump_printf (MSG_NOTE, "\n");
11996         }
11997       else
11998         dump_printf_loc (MSG_NOTE, vect_location,
11999                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12000                          GET_MODE_NAME (loop_vinfo->vector_mode));
12001     }
12002
12003   /* Loops vectorized with a variable factor won't benefit from
12004      unrolling/peeling.  */
12005   if (!vf.is_constant ())
12006     {
12007       loop->unroll = 1;
12008       if (dump_enabled_p ())
12009         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12010                          " variable-length vectorization factor\n");
12011     }
12012   /* Free SLP instances here because otherwise stmt reference counting
12013      won't work.  */
12014   slp_instance instance;
12015   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12016     vect_free_slp_instance (instance);
12017   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12018   /* Clear-up safelen field since its value is invalid after vectorization
12019      since vectorized loop can have loop-carried dependencies.  */
12020   loop->safelen = 0;
12021
12022   if (epilogue)
12023     {
            /* Update the epilogue's vinfo using ADVANCE from vect_do_peeling,
               then copy the simduid and force_vectorize settings of LOOP to
               it and clear its dont_vectorize flag.  */
12024       update_epilogue_loop_vinfo (epilogue, advance);
12025
12026       epilogue->simduid = loop->simduid;
12027       epilogue->force_vectorize = loop->force_vectorize;
12028       epilogue->dont_vectorize = false;
12029     }
12030
12031   return epilogue;
12032 }
12033
12034 /* The code below is trying to perform simple optimization - revert
12035    if-conversion for masked stores, i.e. if the mask of a store is zero
12036    do not perform it and all stored value producers also if possible.
12037    For example,
12038      for (i=0; i<n; i++)
12039        if (c[i])
12040         {
12041           p1[i] += 1;
12042           p2[i] = p3[i] +2;
12043         }
12044    this transformation will produce the following semi-hammock:
12045
12046    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12047      {
12048        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12049        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12050        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12051        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12052        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12053        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12054      }

         LOOP is the loop whose basic blocks are scanned for IFN_MASK_STORE
         calls.  The stores are collected on a worklist and processed from its
         end.  For each store the containing block is split after the store, a
         new block STORE_BB is created and a comparison of the mask against
         zero is added at the end of the original block so that STORE_BB is
         bypassed when the comparison is true.  The store is moved to STORE_BB
         together with the preceding statements that can be moved (no memory
         write, no volatile operand, an SSA_NAME lhs that is only used in
         STORE_BB, and no virtual use other than the one of the store).  If
         the backward walk stops at the next store on the worklist and that
         store uses the same mask, it is moved to STORE_BB as well.
12055 */
12056
12057 void
12058 optimize_mask_stores (class loop *loop)
12059 {
12060   basic_block *bbs = get_loop_body (loop);
12061   unsigned nbbs = loop->num_nodes;
12062   unsigned i;
12063   basic_block bb;
12064   class loop *bb_loop;
12065   gimple_stmt_iterator gsi;
12066   gimple *stmt;
12067   auto_vec<gimple *> worklist;
12068   auto_purge_vect_location sentinel;
12069
12070   vect_location = find_loop_location (loop);
12071   /* Pick up all masked stores in loop if any.  */
12072   for (i = 0; i < nbbs; i++)
12073     {
12074       bb = bbs[i];
12075       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12076            gsi_next (&gsi))
12077         {
12078           stmt = gsi_stmt (gsi);
12079           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12080             worklist.safe_push (stmt);
12081         }
12082     }
12083
        /* The block array returned by get_loop_body is no longer needed.  */
12084   free (bbs);
12085   if (worklist.is_empty ())
12086     return;
12087
12088   /* Loop has masked stores.  */
12089   while (!worklist.is_empty ())
12090     {
12091       gimple *last, *last_store;
12092       edge e, efalse;
12093       tree mask;
12094       basic_block store_bb, join_bb;
12095       gimple_stmt_iterator gsi_to;
12096       tree vdef, new_vdef;
12097       gphi *phi;
12098       tree vectype;
12099       tree zero;
12100
12101       last = worklist.pop ();
12102       mask = gimple_call_arg (last, 2);
12103       bb = gimple_bb (last);
12104       /* Create then_bb and if-then structure in CFG, then_bb belongs to
12105          the same loop as if_bb.  It could be different to LOOP when two
12106          level loop-nest is vectorized and mask_store belongs to the inner
12107          one.  */
12108       e = split_block (bb, last);
12109       bb_loop = bb->loop_father;
12110       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12111       join_bb = e->dest;
12112       store_bb = create_empty_bb (bb);
12113       add_bb_to_loop (store_bb, bb_loop);
12114       e->flags = EDGE_TRUE_VALUE;
12115       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12116       /* Put STORE_BB to likely part.  */
12117       efalse->probability = profile_probability::likely ();
12118       e->probability = efalse->probability.invert ();
12119       store_bb->count = efalse->count ();
12120       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12121       if (dom_info_available_p (CDI_DOMINATORS))
12122         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12123       if (dump_enabled_p ())
12124         dump_printf_loc (MSG_NOTE, vect_location,
12125                          "Create new block %d to sink mask stores.\n",
12126                          store_bb->index);
12127       /* Create vector comparison with boolean result.
               When MASK equals zero the true edge E leads straight to JOIN_BB;
               otherwise the false edge EFALSE enters STORE_BB.  */
12128       vectype = TREE_TYPE (mask);
12129       zero = build_zero_cst (vectype);
12130       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12131       gsi = gsi_last_bb (bb);
12132       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12133       /* Create new PHI node for vdef of the last masked store:
12134          .MEM_2 = VDEF <.MEM_1>
12135          will be converted to
12136          .MEM_3 = VDEF <.MEM_1>
12137          and new PHI node will be created in join bb
12138          .MEM_2 = PHI <.MEM_1, .MEM_3>
12139       */
12140       vdef = gimple_vdef (last);
12141       new_vdef = make_ssa_name (gimple_vop (cfun), last);
12142       gimple_set_vdef (last, new_vdef);
12143       phi = create_phi_node (vdef, join_bb);
12144       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12145
12146       /* Put all masked stores with the same mask to STORE_BB if possible.  */
12147       while (true)
12148         {
12149           gimple_stmt_iterator gsi_from;
12150           gimple *stmt1 = NULL;
12151
12152           /* Move masked store to STORE_BB.  */
12153           last_store = last;
12154           gsi = gsi_for_stmt (last);
12155           gsi_from = gsi;
12156           /* Shift GSI to the previous stmt for further traversal.  */
12157           gsi_prev (&gsi);
12158           gsi_to = gsi_start_bb (store_bb);
12159           gsi_move_before (&gsi_from, &gsi_to);
12160           /* Setup GSI_TO to the non-empty block start.  */
12161           gsi_to = gsi_start_bb (store_bb);
12162           if (dump_enabled_p ())
12163             dump_printf_loc (MSG_NOTE, vect_location,
12164                              "Move stmt to created bb\n%G", last);
12165           /* Move all stored value producers if possible.  */
12166           while (!gsi_end_p (gsi))
12167             {
12168               tree lhs;
12169               imm_use_iterator imm_iter;
12170               use_operand_p use_p;
12171               bool res;
12172
12173               /* Skip debug statements.  */
12174               if (is_gimple_debug (gsi_stmt (gsi)))
12175                 {
12176                   gsi_prev (&gsi);
12177                   continue;
12178                 }
12179               stmt1 = gsi_stmt (gsi);
12180               /* Do not consider statements writing to memory or having
12181                  volatile operand.  */
12182               if (gimple_vdef (stmt1)
12183                   || gimple_has_volatile_ops (stmt1))
12184                 break;
                    /* Remember STMT1's position in GSI_FROM and step GSI to the
                       previous statement before STMT1 is possibly removed or
                       moved.  */
12185               gsi_from = gsi;
12186               gsi_prev (&gsi);
12187               lhs = gimple_get_lhs (stmt1);
12188               if (!lhs)
12189                 break;
12190
12191               /* LHS of vectorized stmt must be SSA_NAME.  */
12192               if (TREE_CODE (lhs) != SSA_NAME)
12193                 break;
12194
12195               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12196                 {
12197                   /* Remove dead scalar statement.  */
12198                   if (has_zero_uses (lhs))
12199                     {
12200                       gsi_remove (&gsi_from, true);
12201                       continue;
12202                     }
12203                 }
12204
12205               /* Check that LHS does not have uses outside of STORE_BB.  */
12206               res = true;
12207               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12208                 {
12209                   gimple *use_stmt;
12210                   use_stmt = USE_STMT (use_p);
12211                   if (is_gimple_debug (use_stmt))
12212                     continue;
12213                   if (gimple_bb (use_stmt) != store_bb)
12214                     {
12215                       res = false;
12216                       break;
12217                     }
12218                 }
12219               if (!res)
12220                 break;
12221
                    /* A statement with a virtual use can only be moved if that
                       use is the same as the one of the store.  */
12222               if (gimple_vuse (stmt1)
12223                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
12224                 break;
12225
12226               /* Can move STMT1 to STORE_BB.  */
12227               if (dump_enabled_p ())
12228                 dump_printf_loc (MSG_NOTE, vect_location,
12229                                  "Move stmt to created bb\n%G", stmt1);
12230               gsi_move_before (&gsi_from, &gsi_to);
12231               /* Shift GSI_TO for further insertion.  */
12232               gsi_prev (&gsi_to);
12233             }
12234           /* Put other masked stores with the same mask to STORE_BB.
                   This only continues when the statement the walk above stopped
                   at is the next store on the worklist.  */
12235           if (worklist.is_empty ()
12236               || gimple_call_arg (worklist.last (), 2) != mask
12237               || worklist.last () != stmt1)
12238             break;
12239           last = worklist.pop ();
12240         }
            /* On edge E, which bypasses STORE_BB, the PHI takes the virtual
               use of the last store that was moved.  */
12241       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12242     }
12243 }
12244
12245 /* Decide whether a zero-based induction variable can be used when
12246    LOOP_VINFO is vectorized with partial vectors.  If it can, return
12247    the value that the induction variable has to be able to hold so
12248    that the rgroups eventually end up with no active vector elements.
12249    Return -1 otherwise.  */
12250
12251 widest_int
12252 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12253 {
12254   tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12255   class loop *the_loop = LOOP_VINFO_LOOP (loop_vinfo);
12256   unsigned HOST_WIDE_INT vf_max = vect_max_vf (loop_vinfo);
12257
12258   /* Everything below is derived from the maximum iteration count of
12259      the loop.  If max_loop_iterations cannot provide one, hand LIMIT
12260      back as it stands.  */
12261   widest_int limit = -1;
12262   if (!max_loop_iterations (the_loop, &limit))
12263     return limit;
12264
12265   /* Add the maximum number of skipped iterations to the maximum
12266      iteration count.  A skip count that is an INTEGER_CST is added
12267      as-is.  Any other skip count, or peeling for alignment without a
12268      skip count, is covered by the conservatively-correct amount
12269      VF_MAX - 1.  */
12270   const bool skip_is_constant
12271     = skip_niters && TREE_CODE (skip_niters) == INTEGER_CST;
12272   if (skip_is_constant)
12273     limit += wi::to_widest (skip_niters);
12274   else if (skip_niters || LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12275     limit += vf_max - 1;
12276
12277   /* LIMIT is now the maximum number of latch iterations, which is also
12278      the maximum in-range IV value.  Round this value down to the
12279      previous vector alignment boundary and then add an extra full
12280      iteration.  */
12281   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12282   widest_int aligned_limit = limit & -(int) known_alignment (vf);
12283
12284   return aligned_limit + vf_max;
12285 }
12286
12287 /* Return true if an induction variable for the rgroup_controls RGC of
12288    LOOP_VINFO might wrap around before it hits a value that produces a
12289    set of all-false masks or zero lengths.  Return false if the IV is
12290    known to hit such a value first.  */
12291
12292 bool
12293 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12294 {
12295   widest_int limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12296
12297   /* A limit of -1 means that no usable limit is available.  */
12298   if (limit == -1)
12299     return true;
12300
12301   /* The IV might wrap if LIMIT scaled by the number of items of RGC
12302      needs more bits than the rgroup comparison type provides.  */
12303   unsigned items_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
12304   unsigned int available_bits
12305     = TYPE_PRECISION (LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo));
12306
12307   return wi::min_precision (limit * items_per_iter, UNSIGNED) > available_bits;
12308 }