[official-gcc.git] / gcc / tree-vect-loop.c
blob 39b7319e8253c351a4f6fbdd8c154330f08f2b1b
/* Loop Vectorization
   Copyright (C) 2003-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "cgraph.h"
#include "tree-cfg.h"
#include "tree-if-conv.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "tree-eh.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it was manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of following
   stmts which use the def of stmt S.  Stmt S is removed if it writes to memory;
   otherwise, we rely on dead code elimination for removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs, are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors, for now will need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/

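/* Illustrative sketch (editorial, not from the upstream sources): the optab
   check described above boils down to a query of the form

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;   (no V8HI addition pattern, so the stmt is not vectorizable)

   where optab_handler returns the insn code of the named pattern that
   implements the operation in the given mode, or CODE_FOR_nothing if the
   target provides none.  */
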
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
                                               bool *, bool *);

/* Subroutine of vect_determine_vf_for_stmt that handles only one
   statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
   may already be set for general statements (not just data refs).  */

static opt_result
vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
                              bool vectype_maybe_set_p,
                              poly_uint64 *vf)
{
  gimple *stmt = stmt_info->stmt;

  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
      || gimple_clobber_p (stmt))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
      return opt_result::success ();
    }

  tree stmt_vectype, nunits_vectype;
  opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
                                                   &stmt_vectype,
                                                   &nunits_vectype);
  if (!res)
    return res;

  if (stmt_vectype)
    {
      if (STMT_VINFO_VECTYPE (stmt_info))
        /* The only case when a vectype had been already set is for stmts
           that contain a data ref, or for "pattern-stmts" (stmts generated
           by the vectorizer to represent/replace a certain idiom).  */
        gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
                     || vectype_maybe_set_p)
                    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
      else
        STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
    }

  if (nunits_vectype)
    vect_update_max_nunits (vf, nunits_vectype);

  return opt_result::success ();
}

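/* Editorial note (illustrative, not from the upstream sources): loosely
   speaking, vect_update_max_nunits above raises the running VF to a common
   multiple of the lane counts seen so far; e.g. starting from the initial
   value 1, a statement whose nunits_vectype is V8HI (8 lanes) raises the
   factor to 8.  */
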
/* Subroutine of vect_determine_vectorization_factor.  Set the vector
   types of STMT_INFO and all attached pattern statements and update
   the vectorization factor VF accordingly.  Return true on success
   or false if something prevented vectorization.  */

static opt_result
vect_determine_vf_for_stmt (vec_info *vinfo,
                            stmt_vec_info stmt_info, poly_uint64 *vf)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
                     stmt_info->stmt);
  opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
  if (!res)
    return res;

  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
      && STMT_VINFO_RELATED_STMT (stmt_info))
    {
      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);

      /* If a pattern statement has def stmts, analyze them too.  */
      for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
           !gsi_end_p (si); gsi_next (&si))
        {
          stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "==> examining pattern def stmt: %G",
                             def_stmt_info->stmt);
          res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
          if (!res)
            return res;
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "==> examining pattern statement: %G",
                         stmt_info->stmt);
      res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
      if (!res)
        return res;
    }

  return opt_result::success ();
}

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte elements,
   on a target with vector size (VS) of 16 bytes, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/

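/* Editorial worked example (not from the upstream sources): with 16-byte
   vectors and the 2-byte "short" arrays from the file-header example,
   VF = 16 / 2 = 8, so the strip-mined loop above becomes
   for (i=0; i<N; i+=8) and each a[i:8] group is a single V8HI operation.  */
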
static opt_result
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor = 1;
  tree scalar_type = NULL_TREE;
  gphi *phi;
  tree vectype;
  stmt_vec_info stmt_info;
  unsigned i;

  DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          phi = si.phi ();
          stmt_info = loop_vinfo->lookup_stmt (phi);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
                             phi);

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info)
              || STMT_VINFO_LIVE_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "get vectype for scalar type: %T\n",
                                 scalar_type);

              vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
              if (!vectype)
                return opt_result::failure_at (phi,
                                               "not vectorized: unsupported "
                                               "data-type %T\n",
                                               scalar_type);
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
                                 vectype);

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
                  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
                  dump_printf (MSG_NOTE, "\n");
                }

              vect_update_max_nunits (&vectorization_factor, vectype);
            }
        }

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          if (is_gimple_debug (gsi_stmt (si)))
            continue;
          stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          opt_result res
            = vect_determine_vf_for_stmt (loop_vinfo,
                                          stmt_info, &vectorization_factor);
          if (!res)
            return res;
        }
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  if (known_le (vectorization_factor, 1U))
    return opt_result::failure_at (vect_location,
                                   "not vectorized: unsupported data-type\n");
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  return opt_result::success ();
}

/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
                     step_expr, init_expr);

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
          || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
              && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
          || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
              && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
                  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
          || !flag_associative_math))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step unknown.\n");
      return false;
    }

  return true;
}

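/* Editorial example (not from the upstream sources), using scev's chrec
   notation {init, +, step}_loopnum: an IV starting at 0 and incremented by 4
   each iteration has access function {0, +, 4}_1, so the evolution part is
   the INTEGER_CST 4 and the function above returns true.  For a degree-2
   chrec such as {0, +, {1, +, 1}_1}_1 the evolution part is itself a chrec,
   tree_is_chrec is true, and the evolution is rejected as not "simple".  */
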
/* Return true if PHI, described by STMT_INFO, is the inner PHI in
   what we are assuming is a double reduction.  For example, given
   a structure like this:

      outer1:
        x_1 = PHI <x_4(outer2), ...>;
        ...

      inner:
        x_2 = PHI <x_1(outer1), ...>;
        ...
        x_3 = ...;

      outer2:
        x_4 = PHI <x_3(inner)>;
        ...

   outer loop analysis would treat x_1 as a double reduction phi and
   this function would then return true for x_2.  */

static bool
vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
{
  use_operand_p use_p;
  ssa_op_iter op_iter;
  FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
    if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
      if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
        return true;
  return false;
}

/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
{
  basic_block bb = loop->header;
  tree init, step;
  auto_vec<stmt_vec_info, 64> worklist;
  gphi_iterator gsi;
  bool double_reduc, reduc_chain;

  DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
        {
          STRIP_NOPS (access_fn);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Access function of PHI: %T\n", access_fn);
          STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
            = initial_condition_in_loop_num (access_fn, loop->num);
          STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
            = evolution_part_in_loop_num (access_fn, loop->num);
        }

      if (!access_fn
          || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
          || (LOOP_VINFO_LOOP (loop_vinfo) != loop
              && TREE_CODE (step) != INTEGER_CST))
        {
          worklist.safe_push (stmt_vinfo);
          continue;
        }

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
                  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      stmt_vec_info stmt_vinfo = worklist.pop ();
      gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
      tree def = PHI_RESULT (phi);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);

      gcc_assert (!virtual_operand_p (def)
                  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      stmt_vec_info reduc_stmt_info
        = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
                                    &reduc_chain);
      if (reduc_stmt_info)
        {
          STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
          STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
          if (double_reduc)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Detected double reduction.\n");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
            }
          else
            {
              if (loop != LOOP_VINFO_LOOP (loop_vinfo))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Detected vectorizable nested cycle.\n");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                }
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Detected reduction.\n");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
                  /* Store the reduction cycles for possible vectorization in
                     loop-aware SLP if it was not detected as reduction
                     chain.  */
                  if (! reduc_chain)
                    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
                      (reduc_stmt_info);
                }
            }
        }
      else
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown def-use cycle pattern.\n");
    }
}

/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner loop, if one exists.
   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
        scalar loop, so we can't change the order of computation when
        vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}

/* Transfer group and reduction information from STMT_INFO to its
   pattern stmt.  */

static void
vect_fixup_reduc_chain (stmt_vec_info stmt_info)
{
  stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
  stmt_vec_info stmtp;
  gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
  REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
  do
    {
      stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
      gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
                           == STMT_VINFO_DEF_TYPE (stmt_info));
      REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
      stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
      if (stmt_info)
        REDUC_GROUP_NEXT_ELEMENT (stmtp)
          = STMT_VINFO_RELATED_STMT (stmt_info);
    }
  while (stmt_info);
}

/* Fixup scalar cycles that now have their stmts detected as patterns.  */

static void
vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
{
  stmt_vec_info first;
  unsigned i;

  FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
    {
      stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
      while (next)
        {
          if ((STMT_VINFO_IN_PATTERN_P (next)
               != STMT_VINFO_IN_PATTERN_P (first))
              || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
            break;
          next = REDUC_GROUP_NEXT_ELEMENT (next);
        }
      /* If all reduction chain members are well-formed patterns adjust
         the group to group the pattern stmts instead.  */
      if (! next
          && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
        {
          if (STMT_VINFO_IN_PATTERN_P (first))
            {
              vect_fixup_reduc_chain (first);
              LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
                = STMT_VINFO_RELATED_STMT (first);
            }
        }
      /* If not all stmts in the chain are patterns or if we failed
         to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
         it as a regular reduction instead.  */
      else
        {
          stmt_vec_info vinfo = first;
          stmt_vec_info last = NULL;
          while (vinfo)
            {
              next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
              REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
              REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
              last = vinfo;
              vinfo = next;
            }
          STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
            = vect_internal_def;
          loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
          LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
          --i;
        }
    }
}

/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
   niter information holds in ASSUMPTIONS.

   Return the loop exit condition.  */


static gcond *
vect_get_loop_niters (class loop *loop, tree *assumptions,
                      tree *number_of_iterations, tree *number_of_iterationsm1)
{
  edge exit = single_exit (loop);
  class tree_niter_desc niter_desc;
  tree niter_assumptions, niter, may_be_zero;
  gcond *cond = get_loop_exit_condition (loop);

  *assumptions = boolean_true_node;
  *number_of_iterationsm1 = chrec_dont_know;
  *number_of_iterations = chrec_dont_know;
  DUMP_VECT_SCOPE ("get_loop_niters");

  if (!exit)
    return cond;

  may_be_zero = NULL_TREE;
  if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
      || chrec_contains_undetermined (niter_desc.niter))
    return cond;

  niter_assumptions = niter_desc.assumptions;
  may_be_zero = niter_desc.may_be_zero;
  niter = niter_desc.niter;

  if (may_be_zero && integer_zerop (may_be_zero))
    may_be_zero = NULL_TREE;

  if (may_be_zero)
    {
      if (COMPARISON_CLASS_P (may_be_zero))
        {
          /* Try to combine may_be_zero with assumptions, this can simplify
             computation of niter expression.  */
          if (niter_assumptions && !integer_nonzerop (niter_assumptions))
            niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                                             niter_assumptions,
                                             fold_build1 (TRUTH_NOT_EXPR,
                                                          boolean_type_node,
                                                          may_be_zero));
          else
            niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
                                 build_int_cst (TREE_TYPE (niter), 0),
                                 rewrite_to_non_trapping_overflow (niter));

          may_be_zero = NULL_TREE;
        }
      else if (integer_nonzerop (may_be_zero))
        {
          *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
          *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
          return cond;
        }
      else
        return cond;
    }

  *assumptions = niter_assumptions;
  *number_of_iterationsm1 = niter;

  /* We want the number of loop header executions which is the number
     of latch executions plus one.
     ???  For UINT_MAX latch executions this number overflows to zero
     for loops like do { n++; } while (n != 0);  */
  if (niter && !chrec_contains_undetermined (niter))
    niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
                         build_int_cst (TREE_TYPE (niter), 1));
  *number_of_iterations = niter;

  return cond;
}

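/* Editorial worked example (not from the upstream sources): for a loop such
   as for (i = 0; i < 4; i++) the latch is executed 3 times, so
   NUMBER_OF_ITERATIONSM1 is 3 and NUMBER_OF_ITERATIONS is 3 + 1 = 4 header
   executions; the ??? note above is the corner case where the latch count is
   already the maximum value of the niter type and the +1 wraps to 0.  */
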
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const class loop *const loop = (const class loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}

/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
   stmt_vec_info structs for all the stmts in LOOP_IN.  */

_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
  : vec_info (vec_info::loop, init_cost (loop_in), shared),
    loop (loop_in),
    bbs (XCNEWVEC (basic_block, loop->num_nodes)),
    num_itersm1 (NULL_TREE),
    num_iters (NULL_TREE),
    num_iters_unchanged (NULL_TREE),
    num_iters_assumptions (NULL_TREE),
    th (0),
    versioning_threshold (0),
    vectorization_factor (0),
    max_vectorization_factor (0),
    mask_skip_niters (NULL_TREE),
    rgroup_compare_type (NULL_TREE),
    simd_if_cond (NULL_TREE),
    unaligned_dr (NULL),
    peeling_for_alignment (0),
    ptr_mask (0),
    ivexpr_map (NULL),
    scan_map (NULL),
    slp_unrolling_factor (1),
    single_scalar_iteration_cost (0),
    vec_outside_cost (0),
    vec_inside_cost (0),
    vectorizable (false),
    can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
    using_partial_vectors_p (false),
    epil_using_partial_vectors_p (false),
    peeling_for_gaps (false),
    peeling_for_niter (false),
    no_data_dependencies (false),
    has_mask_store (false),
    scalar_loop_scaling (profile_probability::uninitialized ()),
    scalar_loop (NULL),
    orig_loop_info (NULL)
{
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the same
     as a reversed postorder traversal, so we are safe.  */

  unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                                          bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  for (unsigned int i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      gimple_stmt_iterator si;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple *phi = gsi_stmt (si);
          gimple_set_uid (phi, 0);
          add_stmt (phi);
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple *stmt = gsi_stmt (si);
          gimple_set_uid (stmt, 0);
          if (is_gimple_debug (stmt))
            continue;
          add_stmt (stmt);
          /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments,
             the third argument is the #pragma omp simd if (x) condition:
             when 0, the loop shouldn't be vectorized; when a non-zero
             constant, it should be vectorized normally; otherwise the loop is
             versioned, with the vectorized copy used if the condition is
             non-zero at runtime.  */
          if (loop_in->simduid
              && is_gimple_call (stmt)
              && gimple_call_internal_p (stmt)
              && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
              && gimple_call_num_args (stmt) >= 3
              && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
              && (loop_in->simduid
                  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
            {
              tree arg = gimple_call_arg (stmt, 2);
              if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
                simd_if_cond = arg;
              else
                gcc_assert (integer_nonzerop (arg));
            }
        }
    }

  epilogue_vinfos.create (6);
}

/* Free all levels of rgroup CONTROLS.  */

void
release_vec_loop_controls (vec<rgroup_controls> *controls)
{
  rgroup_controls *rgc;
  unsigned int i;
  FOR_EACH_VEC_ELT (*controls, i, rgc)
    rgc->controls.release ();
  controls->release ();
}

/* Free all memory used by the _loop_vec_info, as well as all the
   stmt_vec_info structs of all the stmts in the loop.  */

_loop_vec_info::~_loop_vec_info ()
{
  free (bbs);

  release_vec_loop_controls (&masks);
  release_vec_loop_controls (&lens);
  delete ivexpr_map;
  delete scan_map;
  epilogue_vinfos.release ();

  loop->aux = NULL;
}

/* Return an invariant or register for EXPR and emit necessary
   computations in the LOOP_VINFO loop preheader.  */

tree
cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
{
  if (is_gimple_reg (expr)
      || is_gimple_min_invariant (expr))
    return expr;

  if (! loop_vinfo->ivexpr_map)
    loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
  tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
  if (! cached)
    {
      gimple_seq stmts = NULL;
      cached = force_gimple_operand (unshare_expr (expr),
                                     &stmts, true, NULL_TREE);
      if (stmts)
        {
          edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
          gsi_insert_seq_on_edge_immediate (e, stmts);
        }
    }
  return cached;
}

/* Return true if we can use CMP_TYPE as the comparison type to produce
   all masks required to mask LOOP_VINFO.  */

static bool
can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
{
  rgroup_controls *rgm;
  unsigned int i;
  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
    if (rgm->type != NULL_TREE
        && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
                                            cmp_type, rgm->type,
                                            OPTIMIZE_FOR_SPEED))
      return false;
  return true;
}

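/* Editorial example (not from the upstream sources): IFN_WHILE_ULT computes
   an "index < limit" mask per lane, so for an 8-lane mask type a call such as
   WHILE_ULT (8, 10, mask) activates only the first two lanes (8 < 10 and
   9 < 10 hold, 10 < 10 does not).  The check above merely asks whether the
   target can produce such masks when the comparison is done in CMP_TYPE.  */
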
/* Calculate the maximum number of scalars per iteration for every
   rgroup in LOOP_VINFO.  */

static unsigned int
vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
{
  unsigned int res = 1;
  unsigned int i;
  rgroup_controls *rgm;
  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
    res = MAX (res, rgm->max_nscalars_per_iter);
  return res;
}

/* Calculate the minimum precision necessary to represent:

      MAX_NITERS * FACTOR

   as an unsigned integer, where MAX_NITERS is the maximum number of
   loop header iterations for the original scalar form of LOOP_VINFO.  */

static unsigned
vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Get the maximum number of iterations that is representable
     in the counter type.  */
  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;

  /* Get a more refined estimate for the number of iterations.  */
  widest_int max_back_edges;
  if (max_loop_iterations (loop, &max_back_edges))
    max_ni = wi::smin (max_ni, max_back_edges + 1);

  /* Work out how many bits we need to represent the limit.  */
  return wi::min_precision (max_ni * factor, UNSIGNED);
}

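/* Editorial worked example (not from the upstream sources): if the niter
   analysis proves at most 999 latch executions, MAX_NITERS is 999 + 1 = 1000
   header iterations; with FACTOR == 2 the limit is 2000 and
   wi::min_precision (2000, UNSIGNED) is 11, since 2000 needs 11 bits
   (2^10 = 1024 < 2000 <= 2047).  */
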
/* True if the loop needs peeling or partial vectors when vectorized.  */

static bool
vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
{
  unsigned HOST_WIDE_INT const_vf;
  HOST_WIDE_INT max_niter
    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
                                          (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
    {
      /* Work out the (constant) number of iterations that need to be
         peeled for reasons other than niters.  */
      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
        peel_niter += 1;
      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
                       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
        return true;
    }
  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
           /* ??? When peeling for gaps but not alignment, we could
              try to check whether the (variable) niters is known to be
              VF * N + 1.  That's something of a niche case though.  */
           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
           || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
           || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
                < (unsigned) exact_log2 (const_vf))
               /* In case of versioning, check if the maximum number of
                  iterations is greater than th.  If they are identical,
                  the epilogue is unnecessary.  */
               && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
                   || ((unsigned HOST_WIDE_INT) max_niter
                       > (th / const_vf) * const_vf))))
    return true;

  return false;
}

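/* Editorial worked example (not from the upstream sources): with a known
   iteration count of 17, a vectorization factor of 8 and no peeling for
   alignment or gaps, 17 is not a multiple of 8, so the function above
   returns true and the loop will need either an epilogue or partial
   vectors.  */
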
/* Each statement in LOOP_VINFO can be masked where necessary.  Check
   whether we can actually generate the masks required.  Return true if so,
   storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */

static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
  unsigned int min_ni_width;
  unsigned int max_nscalars_per_iter
    = vect_get_max_nscalars_per_iter (loop_vinfo);

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* Work out how many bits we need to represent the limit.  */
  min_ni_width
    = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);

  /* Find a scalar mode for which WHILE_ULT is supported.  */
  opt_scalar_int_mode cmp_mode_iter;
  tree cmp_type = NULL_TREE;
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;

  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
                                      UNSIGNED);

  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= min_ni_width
          && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
        {
          tree this_type = build_nonstandard_integer_type (cmp_bits, true);
          if (this_type
              && can_produce_all_loop_masks_p (loop_vinfo, this_type))
            {
              /* Although we could stop as soon as we find a valid mode,
                 there are at least two reasons why that's not always the
                 best choice:

                 - An IV that's Pmode or wider is more likely to be reusable
                   in address calculations than an IV that's narrower than
                   Pmode.

                 - Doing the comparison in IV_PRECISION or wider allows
                   a natural 0-based IV, whereas using a narrower comparison
                   type requires mitigations against wrap-around.

                 Conversely, if the IV limit is variable, doing the comparison
                 in a wider type than the original type can introduce
                 unnecessary extensions, so picking the widest valid mode
                 is not always a good choice either.

                 Here we prefer the first IV type that's Pmode or wider,
                 and the first comparison type that's IV_PRECISION or wider.
                 (The comparison type must be no wider than the IV type,
                 to avoid extensions in the vector loop.)

                 ??? We might want to try continuing beyond Pmode for ILP32
                 targets if CMP_BITS < IV_PRECISION.  */
              iv_type = this_type;
              if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
                cmp_type = this_type;
              if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
                break;
            }
        }
    }

  if (!cmp_type)
    return false;

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  return true;
}

/* Check whether we can use vector access with length based on precision
   comparison.  So far, to keep it simple, we only allow the case that the
   precision of the target-supported length is larger than the precision
   required by the loop niters.  */

static bool
vect_verify_loop_lens (loop_vec_info loop_vinfo)
{
  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    return false;

  unsigned int max_nitems_per_iter = 1;
  unsigned int i;
  rgroup_controls *rgl;
  /* Find the maximum number of items per iteration for every rgroup.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
    {
      unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
      max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
    }

  /* Work out how many bits we need to represent the length limit.  */
  unsigned int min_ni_prec
    = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);

  /* Now use the maximum of the below precisions for one suitable IV type:
     - the IV's natural precision
     - the precision needed to hold: the maximum number of scalar
       iterations multiplied by the scale factor (min_ni_prec above)
     - the Pmode precision

     If min_ni_prec is less than the precision of the current niters,
     we prefer to still use the niters type.  Prefer to use Pmode and
     a wider IV to avoid narrow conversions.  */

  unsigned int ni_prec
    = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
  min_ni_prec = MAX (min_ni_prec, ni_prec);
  min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));

  tree iv_type = NULL_TREE;
  opt_scalar_int_mode tmode_iter;
  FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
    {
      scalar_mode tmode = tmode_iter.require ();
      unsigned int tbits = GET_MODE_BITSIZE (tmode);

      /* ??? Do we really want to construct one IV whose precision exceeds
         BITS_PER_WORD?  */
      if (tbits > BITS_PER_WORD)
        break;

      /* Find the first available standard integral type.  */
      if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
        {
          iv_type = build_nonstandard_integer_type (tbits, true);
          break;
        }
    }

  if (!iv_type)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't vectorize with length-based partial vectors"
                         " because there is no suitable iv type.\n");
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;

  return true;
}

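/* Editorial illustration (not from the upstream sources): with length-based
   partial vectors, a loop over 10 elements at 4 elements per vector would,
   loosely speaking, run three vector iterations whose active lengths are
   4, 4 and 2; the function above only verifies that a sufficiently wide IV
   type exists to drive such length computations.  */
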
/* Calculate the cost of one scalar iteration of the loop.  */
static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes, factor;
  int innerloop_iters, i;

  DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");

  /* Gather costs for statements in the scalar loop.  */

  /* FORNOW.  */
  innerloop_iters = 1;
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple *stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);

          if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
            continue;

          /* Skip stmts that are not vectorized inside the loop.  */
          stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
          if (!STMT_VINFO_RELEVANT_P (vstmt_info)
              && (!STMT_VINFO_LIVE_P (vstmt_info)
                  || !VECTORIZABLE_CYCLE_DEF
                        (STMT_VINFO_DEF_TYPE (vstmt_info))))
            continue;

          vect_cost_for_stmt kind;
          if (STMT_VINFO_DATA_REF (stmt_info))
            {
              if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
                kind = scalar_load;
              else
                kind = scalar_store;
            }
          else if (vect_nop_conversion_p (stmt_info))
            continue;
          else
            kind = scalar_stmt;

          record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
                            factor, kind, stmt_info, 0, vect_prologue);
        }
    }

  /* Now accumulate cost.  */
  void *target_cost_data = init_cost (loop);
  stmt_info_for_cost *si;
  int j;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
                    j, si)
    (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
                          si->kind, si->stmt_info, si->vectype,
                          si->misalign, vect_body);
  unsigned dummy, body_cost = 0;
  finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
  destroy_cost_data (target_cost_data);
  LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
}

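/* Editorial worked example (not from the upstream sources): a scalar body
   consisting of one load, one add and one store records one scalar_load,
   one scalar_stmt and one scalar_store entry (each weighted by FACTOR, which
   is 50 for statements in the inner loop of an outer-loop nest), and the
   target's add_stmt_cost hook then turns those entries into the summed
   single-scalar-iteration cost stored above.  */
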
/* Function vect_analyze_loop_form_1.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e., a countable loop.  The
     niter could be analyzed under some assumptions.  */

opt_result
vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
                          tree *assumptions, tree *number_of_iterationsm1,
                          tree *number_of_iterations, gcond **inner_loop_cond)
{
  DUMP_VECT_SCOPE ("vect_analyze_loop_form");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        return opt_result::failure_at (vect_location,
                                       "not vectorized:"
                                       " control flow in loop.\n");

      if (empty_block_p (loop->header))
        return opt_result::failure_at (vect_location,
                                       "not vectorized: empty loop.\n");
    }
  else
    {
      class loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        return opt_result::failure_at (vect_location,
                                       "not vectorized:"
                                       " multiple nested loops.\n");

      if (loop->num_nodes != 5)
        return opt_result::failure_at (vect_location,
                                       "not vectorized:"
                                       " control flow in loop.\n");

      entryedge = loop_preheader_edge (innerloop);
      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        return opt_result::failure_at (vect_location,
                                       "not vectorized:"
                                       " unsupported outerloop form.\n");

      /* Analyze the inner-loop.  */
      tree inner_niterm1, inner_niter, inner_assumptions;
      opt_result res
        = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
                                    &inner_assumptions, &inner_niterm1,
                                    &inner_niter, NULL);
      if (!res)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: Bad inner loop.\n");
          return res;
        }

      /* Don't support analyzing niter under assumptions for inner
         loop.  */
      if (!integer_onep (inner_assumptions))
        return opt_result::failure_at (vect_location,
                                       "not vectorized: Bad inner loop.\n");

      if (!expr_invariant_in_loop_p (loop, inner_niter))
        return opt_result::failure_at (vect_location,
                                       "not vectorized: inner-loop count not"
                                       " invariant.\n");

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Considering outer-loop vectorization.\n");
    }

  if (!single_exit (loop))
    return opt_result::failure_at (vect_location,
                                   "not vectorized: multiple exits.\n");
  if (EDGE_COUNT (loop->header->preds) != 2)
    return opt_result::failure_at (vect_location,
                                   "not vectorized:"
                                   " too many incoming edges.\n");

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    return opt_result::failure_at (vect_location,
                                   "not vectorized: latch block not empty.\n");

  /* Make sure the exit is not abnormal.  */
  edge e = single_exit (loop);
  if (e->flags & EDGE_ABNORMAL)
    return opt_result::failure_at (vect_location,
                                   "not vectorized:"
                                   " abnormal loop exit edge.\n");

  *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
                                     number_of_iterationsm1);
  if (!*loop_cond)
    return opt_result::failure_at
      (vect_location,
       "not vectorized: complicated exit condition.\n");

  if (integer_zerop (*assumptions)
      || !*number_of_iterations
      || chrec_contains_undetermined (*number_of_iterations))
    return opt_result::failure_at
      (*loop_cond,
       "not vectorized: number of iterations cannot be computed.\n");

  if (integer_zerop (*number_of_iterations))
    return opt_result::failure_at
      (*loop_cond,
       "not vectorized: number of iterations = 0.\n");

  return opt_result::success ();
}

/* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */

opt_loop_vec_info
vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
{
  tree assumptions, number_of_iterations, number_of_iterationsm1;
  gcond *loop_cond, *inner_loop_cond = NULL;

  opt_result res
    = vect_analyze_loop_form_1 (loop, &loop_cond,
                                &assumptions, &number_of_iterationsm1,
                                &number_of_iterations, &inner_loop_cond);
  if (!res)
    return opt_loop_vec_info::propagate_failure (res);

  loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
  LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
  if (!integer_onep (assumptions))
    {
      /* We consider to vectorize this loop by versioning it under
         some assumptions.  In order to do this, we need to clear
         existing information computed by scev and niter analyzer.  */
      scev_reset_htab ();
      free_numbers_of_iterations_estimates (loop);
      /* Also set flag for this loop so that following scev and niter
         analysis are done under the assumptions.  */
      loop_constraint_set (loop, LOOP_C_FINITE);
      /* Also record the assumptions for versioning.  */
      LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Symbolic number of iterations is ");
          dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
          dump_printf (MSG_NOTE, "\n");
        }
    }

  stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
  STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
  if (inner_loop_cond)
    {
      stmt_vec_info inner_loop_cond_info
        = loop_vinfo->lookup_stmt (inner_loop_cond);
      STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
    }

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return opt_loop_vec_info::success (loop_vinfo);
}

/* Scan the loop stmts and, depending on whether there are any (non-)SLP
   statements, update the vectorization factor.  */

static void
vect_update_vf_for_slp (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor;
  int i;

  DUMP_VECT_SCOPE ("vect_update_vf_for_slp");

  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (known_ne (vectorization_factor, 0U));

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross iteration parallelism is not
     exploited.  */
  bool only_slp_in_loop = true;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
          if (!stmt_info)
            continue;
          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          if (is_gimple_debug (gsi_stmt (si)))
            continue;
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          stmt_info = vect_stmt_to_vectorize (stmt_info);
          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    }

  if (only_slp_in_loop)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Loop contains only SLP stmts\n");
      vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
    }
  else
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Loop contains SLP and non-SLP stmts\n");
      /* Both the vectorization factor and unroll factor have the form
         GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
         so they must have a common multiple.  */
      vectorization_factor
        = force_common_multiple (vectorization_factor,
                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
    }

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "Updating vectorization factor to ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, ".\n");
    }
}

/* Return true if STMT_INFO describes a double reduction phi and if
   the other phi in the reduction is also relevant for vectorization.
   This rejects cases such as:

      outer1:
        x_1 = PHI <x_3(outer2), ...>;
        ...

      inner:
        x_2 = ...;
        ...

      outer2:
        x_3 = PHI <x_2(inner)>;

   if nothing in x_2 or elsewhere makes x_1 relevant.  */

static bool
vect_active_double_reduction_p (stmt_vec_info stmt_info)
{
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;

  return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
}

/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static opt_result
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  bool ok;

  DUMP_VECT_SCOPE ("vect_analyze_loop_operations");

  auto_vec<stmt_info_for_cost> cost_vec;

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          gphi *phi = si.phi ();
          ok = true;

          stmt_info = loop_vinfo->lookup_stmt (phi);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
          if (virtual_operand_p (gimple_phi_result (phi)))
            continue;

          /* Inner-loop loop-closed exit phi in outer-loop vectorization
             (i.e., a phi in the tail of the outer-loop).  */
          if (! is_loop_header_bb_p (bb))
            {
              /* FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is double reduction,
                 i.e., this phi is vect_reduction_def), because this case
                 would require us to actually do something here.  */
              if (STMT_VINFO_LIVE_P (stmt_info)
                  && !vect_active_double_reduction_p (stmt_info))
                return opt_result::failure_at (phi,
                                               "Unsupported loop-closed phi"
                                               " in outer-loop.\n");

              /* If PHI is used in the outer loop, we check that its operand
                 is defined in the inner loop.  */
              if (STMT_VINFO_RELEVANT_P (stmt_info))
                {
                  tree phi_op;

                  if (gimple_phi_num_args (phi) != 1)
                    return opt_result::failure_at (phi, "unsupported phi");

                  phi_op = PHI_ARG_DEF (phi, 0);
                  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
                  if (!op_def_info)
                    return opt_result::failure_at (phi, "unsupported phi\n");

                  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
                      && (STMT_VINFO_RELEVANT (op_def_info)
                          != vect_used_in_outer_by_reduction))
                    return opt_result::failure_at (phi, "unsupported phi\n");

                  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
                       || (STMT_VINFO_DEF_TYPE (stmt_info)
                           == vect_double_reduction_def))
                      && !vectorizable_lc_phi (loop_vinfo,
                                               stmt_info, NULL, NULL))
                    return opt_result::failure_at (phi, "unsupported phi\n");
                }

              continue;
            }

          gcc_assert (stmt_info);

          if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
               || STMT_VINFO_LIVE_P (stmt_info))
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            /* A scalar-dependence cycle that we don't support.  */
            return opt_result::failure_at (phi,
                                           "not vectorized:"
                                           " scalar dependence cycle.\n");

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
                  && ! PURE_SLP_STMT (stmt_info))
                ok = vectorizable_induction (loop_vinfo,
                                             stmt_info, NULL, NULL,
                                             &cost_vec);
              else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
                        || (STMT_VINFO_DEF_TYPE (stmt_info)
                            == vect_double_reduction_def)
                        || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
                       && ! PURE_SLP_STMT (stmt_info))
                ok = vectorizable_reduction (loop_vinfo,
                                             stmt_info, NULL, NULL, &cost_vec);
            }

          /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
          if (ok
              && STMT_VINFO_LIVE_P (stmt_info)
              && !PURE_SLP_STMT (stmt_info))
            ok = vectorizable_live_operation (loop_vinfo,
                                              stmt_info, NULL, NULL, NULL,
                                              -1, false, &cost_vec);

          if (!ok)
            return opt_result::failure_at (phi,
                                           "not vectorized: relevant phi not "
                                           "supported: %G",
                                           static_cast <gimple *> (phi));
        }

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          gimple *stmt = gsi_stmt (si);
          if (!gimple_clobber_p (stmt)
              && !is_gimple_debug (stmt))
            {
              opt_result res
                = vect_analyze_stmt (loop_vinfo,
                                     loop_vinfo->lookup_stmt (stmt),
                                     &need_to_vectorize,
                                     NULL, NULL, &cost_vec);
              if (!res)
                return res;
            }
        }
    } /* bbs */

  add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "All the computation can be taken out of the loop.\n");
      return opt_result::failure_at
        (vect_location,
         "not vectorized: redundant loop. no profit to vectorize.\n");
    }

  return opt_result::success ();
}

1785 /* Return true if we know that the iteration count is smaller than the
1786 vectorization factor. Return false if it isn't, or if we can't be sure
1787 either way. */
1789 static bool
1790 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1792 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1794 HOST_WIDE_INT max_niter;
1795 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1796 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1797 else
1798 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1800 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1801 return true;
1803 return false;
1806 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1807 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1808 definitely no, or -1 if it's worth retrying. */
1810 static int
1811 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1813 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1814 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1816 /* Only loops that can handle partially-populated vectors can have iteration
1817 counts less than the vectorization factor. */
1818 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1820 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 "not vectorized: iteration count smaller than "
1825 "vectorization factor.\n");
1826 return 0;
1830 int min_profitable_iters, min_profitable_estimate;
1831 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1832 &min_profitable_estimate);
1834 if (min_profitable_iters < 0)
1836 if (dump_enabled_p ())
1837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 "not vectorized: vectorization not profitable.\n");
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1841 "not vectorized: vector version will never be "
1842 "profitable.\n");
1843 return -1;
1846 int min_scalar_loop_bound = (param_min_vect_loop_bound
1847 * assumed_vf);
1849 /* Use the cost model only if it is more conservative than the
1850 user-specified threshold. */
1851 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1852 min_profitable_iters);
1854 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1856 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1857 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 "not vectorized: vectorization not profitable.\n");
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_NOTE, vect_location,
1864 "not vectorized: iteration count smaller than user "
1865 "specified loop bound parameter or minimum profitable "
1866 "iterations (whichever is more conservative).\n");
1867 return 0;
1870 /* The static profitability threshold min_profitable_estimate includes
1871 the cost of having to check at runtime whether the scalar loop
1872 should be used instead. If it turns out that we don't need or want
1873 such a check, the threshold we should use for the static estimate
1874 is simply the point at which the vector loop becomes more profitable
1875 than the scalar loop. */
1876 if (min_profitable_estimate > min_profitable_iters
1877 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1878 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1879 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1880 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1882 if (dump_enabled_p ())
1883 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1884 " choice between the scalar and vector loops\n");
1885 min_profitable_estimate = min_profitable_iters;
1888 HOST_WIDE_INT estimated_niter;
1890 /* If we are vectorizing an epilogue then we know the maximum number of
1891 scalar iterations it will cover is at least one lower than the
1892 vectorization factor of the main loop. */
1893 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1894 estimated_niter
1895 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1896 else
1898 estimated_niter = estimated_stmt_executions_int (loop);
1899 if (estimated_niter == -1)
1900 estimated_niter = likely_max_stmt_executions_int (loop);
1902 if (estimated_niter != -1
1903 && ((unsigned HOST_WIDE_INT) estimated_niter
1904 < MAX (th, (unsigned) min_profitable_estimate)))
1906 if (dump_enabled_p ())
1907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1908 "not vectorized: estimated iteration count too "
1909 "small.\n");
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_NOTE, vect_location,
1912 "not vectorized: estimated iteration count smaller "
1913 "than specified loop bound parameter or minimum "
1914 "profitable iterations (whichever is more "
1915 "conservative).\n");
1916 return -1;
1919 return 1;
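/* As a rough worked example of the thresholds above (the numbers are
   made up rather than taken from any particular cost model): with
   param_min_vect_loop_bound == 0, assumed_vf == 4 and
   min_profitable_iters == 7 we get

     min_scalar_loop_bound = 0 * 4 = 0
     th = MAX (min_scalar_loop_bound, min_profitable_iters) = 7

   so a loop whose iteration count is known to be below 7 is rejected
   for good (return 0), while a loop whose *estimated* iteration count
   is below MAX (th, min_profitable_estimate) is rejected with the
   option of retrying at a different vectorization factor (return -1).  */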
1922 static opt_result
1923 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1924 vec<data_reference_p> *datarefs,
1925 unsigned int *n_stmts)
1927 *n_stmts = 0;
1928 for (unsigned i = 0; i < loop->num_nodes; i++)
1929 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1930 !gsi_end_p (gsi); gsi_next (&gsi))
1932 gimple *stmt = gsi_stmt (gsi);
1933 if (is_gimple_debug (stmt))
1934 continue;
1935 ++(*n_stmts);
1936 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1937 NULL, 0);
1938 if (!res)
1940 if (is_gimple_call (stmt) && loop->safelen)
1942 tree fndecl = gimple_call_fndecl (stmt), op;
1943 if (fndecl != NULL_TREE)
1945 cgraph_node *node = cgraph_node::get (fndecl);
1946 if (node != NULL && node->simd_clones != NULL)
1948 unsigned int j, n = gimple_call_num_args (stmt);
1949 for (j = 0; j < n; j++)
1951 op = gimple_call_arg (stmt, j);
1952 if (DECL_P (op)
1953 || (REFERENCE_CLASS_P (op)
1954 && get_base_address (op)))
1955 break;
1957 op = gimple_call_lhs (stmt);
1958 /* Ignore #pragma omp declare simd functions
1959 if they don't have data references in the
1960 call stmt itself. */
1961 if (j == n
1962 && !(op
1963 && (DECL_P (op)
1964 || (REFERENCE_CLASS_P (op)
1965 && get_base_address (op)))))
1966 continue;
1970 return res;
1972 /* If dependence analysis will give up due to the limit on the
1973 number of datarefs, stop here and fail fatally. */
1974 if (datarefs->length ()
1975 > (unsigned)param_loop_max_datarefs_for_datadeps)
1976 return opt_result::failure_at (stmt, "exceeded param "
1977 "loop-max-datarefs-for-datadeps\n");
1979 return opt_result::success ();
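/* As an illustrative sketch of the safelen special case above: in a
   loop with a nonzero safelen, e.g. one annotated with
   "#pragma omp simd", a call to a function that has SIMD clones,
   such as

     #pragma omp declare simd
     extern float foo (float);

   does not make the analysis fail as long as neither its arguments
   nor its lhs contain a data reference (a DECL or a memory reference
   with a known base address); such a call is skipped here and left
   for the SIMD-clone handling to deal with later.  If any argument or
   the lhs does touch memory, the failure from
   vect_find_stmt_data_reference is propagated as usual.  */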
1982 /* Look for SLP-only access groups and turn each individual access into its own
1983 group. */
1984 static void
1985 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1987 unsigned int i;
1988 struct data_reference *dr;
1990 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1992 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1993 FOR_EACH_VEC_ELT (datarefs, i, dr)
1995 gcc_assert (DR_REF (dr));
1996 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1998 /* Check if the access is part of an interleaving chain. */
1999 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2001 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2002 unsigned int group_size = DR_GROUP_SIZE (first_element);
2004 /* Check whether this is an SLP-only group. */
2005 if (!STMT_SLP_TYPE (stmt_info)
2006 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2008 /* Dissolve the group. */
2009 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2011 stmt_vec_info vinfo = first_element;
2012 while (vinfo)
2014 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2015 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2016 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2017 DR_GROUP_SIZE (vinfo) = 1;
2018 if (STMT_VINFO_STRIDED_P (first_element))
2019 DR_GROUP_GAP (vinfo) = 0;
2020 else
2021 DR_GROUP_GAP (vinfo) = group_size - 1;
2022 vinfo = next;
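/* A small example of the dissolving done above (hypothetical group,
   for illustration only): a group of two loads

     ... = a[2*i];
     ... = a[2*i+1];

   that was marked SLP_VECT_ONLY but did not end up being vectorized
   with SLP is split so that each load becomes its own group of size 1,
   with DR_GROUP_NEXT_ELEMENT cleared.  For a non-strided access the
   recorded gap becomes group_size - 1 (here 1), so the element of the
   original group that is no longer covered is still accounted for;
   for strided accesses the gap is simply 0.  */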
2029 /* Determine if operating on full vectors for LOOP_VINFO might leave
2030 some scalar iterations still to do. If so, decide how we should
2031 handle those scalar iterations. The possibilities are:
2033 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2034 In this case:
2036 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2037 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2038 LOOP_VINFO_PEELING_FOR_NITER == false
2040 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2041 to handle the remaining scalar iterations. In this case:
2043 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2044 LOOP_VINFO_PEELING_FOR_NITER == true
2046 There are two choices:
2048 (2a) Consider vectorizing the epilogue loop at the same VF as the
2049 main loop, but using partial vectors instead of full vectors.
2050 In this case:
2052 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2054 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2055 In this case:
2057 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2059 When FOR_EPILOGUE_P is true, make this determination based on the
2060 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2061 based on the assumption that LOOP_VINFO is the main loop. The caller
2062 has made sure that the number of iterations is set appropriately for
2063 this value of FOR_EPILOGUE_P. */
2065 opt_result
2066 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2067 bool for_epilogue_p)
2069 /* Determine whether there would be any scalar iterations left over. */
2070 bool need_peeling_or_partial_vectors_p
2071 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2073 /* Decide whether to vectorize the loop with partial vectors. */
2074 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2075 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2076 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2077 && need_peeling_or_partial_vectors_p)
2079 /* For partial-vector-usage=1, try to push the handling of partial
2080 vectors to the epilogue, with the main loop continuing to operate
2081 on full vectors.
2083 ??? We could then end up failing to use partial vectors if we
2084 decide to peel iterations into a prologue, and if the main loop
2085 then ends up processing fewer than VF iterations. */
2086 if (param_vect_partial_vector_usage == 1
2087 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2088 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2089 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2090 else
2091 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2094 if (dump_enabled_p ())
2096 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2097 dump_printf_loc (MSG_NOTE, vect_location,
2098 "operating on partial vectors%s.\n",
2099 for_epilogue_p ? " for epilogue loop" : "");
2100 else
2101 dump_printf_loc (MSG_NOTE, vect_location,
2102 "operating only on full vectors%s.\n",
2103 for_epilogue_p ? " for epilogue loop" : "");
2106 if (for_epilogue_p)
2108 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2109 gcc_assert (orig_loop_vinfo);
2110 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2111 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2112 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2115 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2116 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2118 /* Check that the loop processes at least one full vector. */
2119 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2120 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2121 if (known_lt (wi::to_widest (scalar_niters), vf))
2122 return opt_result::failure_at (vect_location,
2123 "loop does not have enough iterations"
2124 " to support vectorization.\n");
2126 /* If we need to peel an extra epilogue iteration to handle data
2127 accesses with gaps, check that there are enough scalar iterations
2128 available.
2130 The check above is redundant with this one when peeling for gaps,
2131 but the distinction is useful for diagnostics. */
2132 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2133 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2134 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2135 return opt_result::failure_at (vect_location,
2136 "loop does not have enough iterations"
2137 " to support peeling for gaps.\n");
2140 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2141 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2142 && need_peeling_or_partial_vectors_p);
2144 return opt_result::success ();
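/* To make the cases above concrete, here is a source-level sketch
   (not the exact code the vectorizer emits) for a loop with N
   iterations and a vectorization factor of 4:

     for (i = 0; i < N; i++)
       a[i] = b[i] + c[i];

   Case (1), partial vectors: the single vector loop also covers the
   N % 4 leftover iterations, e.g. by masking off the excess lanes in
   the last iteration, and no scalar epilogue is needed.

   Case (2), full vectors plus peeling for niters:

     for (i = 0; i + 4 <= N; i += 4)
       a[i:i+4] = b[i:i+4] + c[i:i+4];
     for (; i < N; i++)          <-- epilogue loop
       a[i] = b[i] + c[i];

   where the epilogue itself may later be vectorized, either with
   partial vectors at the same VF (case 2a) or at a lower VF
   (case 2b).  */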
2147 /* Function vect_analyze_loop_2.
2149 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2150 for it. The different analyses will record information in the
2151 loop_vec_info struct. */
2152 static opt_result
2153 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2155 opt_result ok = opt_result::success ();
2156 int res;
2157 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2158 poly_uint64 min_vf = 2;
2159 loop_vec_info orig_loop_vinfo = NULL;
2161 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2162 loop_vec_info of the first vectorized loop. */
2163 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2164 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2165 else
2166 orig_loop_vinfo = loop_vinfo;
2167 gcc_assert (orig_loop_vinfo);
2169 /* The first group of checks is independent of the vector size. */
2170 fatal = true;
2172 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2173 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2174 return opt_result::failure_at (vect_location,
2175 "not vectorized: simd if(0)\n");
2177 /* Find all data references in the loop (which correspond to vdefs/vuses)
2178 and analyze their evolution in the loop. */
2180 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2182 /* Gather the data references and count stmts in the loop. */
2183 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2185 opt_result res
2186 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2187 &LOOP_VINFO_DATAREFS (loop_vinfo),
2188 n_stmts);
2189 if (!res)
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193 "not vectorized: loop contains function "
2194 "calls or data references that cannot "
2195 "be analyzed\n");
2196 return res;
2198 loop_vinfo->shared->save_datarefs ();
2200 else
2201 loop_vinfo->shared->check_datarefs ();
2203 /* Analyze the data references and also adjust the minimal
2204 vectorization factor according to the loads and stores. */
2206 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2207 if (!ok)
2209 if (dump_enabled_p ())
2210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2211 "bad data references.\n");
2212 return ok;
2215 /* Classify all cross-iteration scalar data-flow cycles.
2216 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2217 vect_analyze_scalar_cycles (loop_vinfo);
2219 vect_pattern_recog (loop_vinfo);
2221 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2223 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2224 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2226 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2227 if (!ok)
2229 if (dump_enabled_p ())
2230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2231 "bad data access.\n");
2232 return ok;
2235 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2237 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2238 if (!ok)
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2242 "unexpected pattern.\n");
2243 return ok;
2246 /* The rest of the analysis below depends on the vector size in some
2246 way, so from here on failures are no longer fatal. */
2247 fatal = false;
2249 /* Analyze data dependences between the data-refs in the loop
2250 and adjust the maximum vectorization factor according to
2251 the dependences.
2252 FORNOW: fail at the first data dependence that we encounter. */
2254 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2255 if (!ok)
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "bad data dependence.\n");
2260 return ok;
2262 if (max_vf != MAX_VECTORIZATION_FACTOR
2263 && maybe_lt (max_vf, min_vf))
2264 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2265 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2267 ok = vect_determine_vectorization_factor (loop_vinfo);
2268 if (!ok)
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2272 "can't determine vectorization factor.\n");
2273 return ok;
2275 if (max_vf != MAX_VECTORIZATION_FACTOR
2276 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2277 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2279 /* Compute the scalar iteration cost. */
2280 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2282 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2284 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2285 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2286 if (!ok)
2287 return ok;
2289 /* If there are any SLP instances mark them as pure_slp. */
2290 bool slp = vect_make_slp_decision (loop_vinfo);
2291 if (slp)
2293 /* Find stmts that need to be both vectorized and SLPed. */
2294 vect_detect_hybrid_slp (loop_vinfo);
2296 /* Update the vectorization factor based on the SLP decision. */
2297 vect_update_vf_for_slp (loop_vinfo);
2299 /* Optimize the SLP graph with the vectorization factor fixed. */
2300 vect_optimize_slp (loop_vinfo);
2303 bool saved_can_use_partial_vectors_p
2304 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2306 /* We don't expect to have to roll back to anything other than an empty
2307 set of rgroups. */
2308 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2310 /* This is the point where we can re-start analysis with SLP forced off. */
2311 start_over:
2313 /* Now the vectorization factor is final. */
2314 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2315 gcc_assert (known_ne (vectorization_factor, 0U));
2317 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2319 dump_printf_loc (MSG_NOTE, vect_location,
2320 "vectorization_factor = ");
2321 dump_dec (MSG_NOTE, vectorization_factor);
2322 dump_printf (MSG_NOTE, ", niters = %wd\n",
2323 LOOP_VINFO_INT_NITERS (loop_vinfo));
2326 /* Analyze the alignment of the data-refs in the loop.
2327 Fail if a data reference is found that cannot be vectorized. */
2329 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2330 if (!ok)
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 "bad data alignment.\n");
2335 return ok;
2338 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2339 It is important to call pruning after vect_analyze_data_ref_accesses,
2340 since we use grouping information gathered by interleaving analysis. */
2341 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2342 if (!ok)
2343 return ok;
2345 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2346 vectorization, since we do not want to add extra peeling or
2347 add versioning for alignment. */
2348 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2349 /* This pass will decide on using loop versioning and/or loop peeling in
2350 order to enhance the alignment of data references in the loop. */
2351 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2352 if (!ok)
2353 return ok;
2355 if (slp)
2357 /* Analyze operations in the SLP instances. Note this may
2358 remove unsupported SLP instances which makes the above
2359 SLP kind detection invalid. */
2360 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2361 vect_slp_analyze_operations (loop_vinfo);
2362 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2364 ok = opt_result::failure_at (vect_location,
2365 "unsupported SLP instances\n");
2366 goto again;
2369 /* Check whether any load in ALL SLP instances is possibly permuted. */
2370 slp_tree load_node, slp_root;
2371 unsigned i, x;
2372 slp_instance instance;
2373 bool can_use_lanes = true;
2374 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2376 slp_root = SLP_INSTANCE_TREE (instance);
2377 int group_size = SLP_TREE_LANES (slp_root);
2378 tree vectype = SLP_TREE_VECTYPE (slp_root);
2379 bool loads_permuted = false;
2380 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2382 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2383 continue;
2384 unsigned j;
2385 stmt_vec_info load_info;
2386 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2387 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2389 loads_permuted = true;
2390 break;
2394 /* If the loads and stores can be handled with load/store-lane
2395 instructions record it and move on to the next instance. */
2396 if (loads_permuted
2397 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2398 && vect_store_lanes_supported (vectype, group_size, false))
2400 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2402 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2403 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2404 /* Use SLP for strided accesses (or if we can't
2405 load-lanes). */
2406 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2407 || ! vect_load_lanes_supported
2408 (STMT_VINFO_VECTYPE (stmt_vinfo),
2409 DR_GROUP_SIZE (stmt_vinfo), false))
2410 break;
2413 can_use_lanes
2414 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2416 if (can_use_lanes && dump_enabled_p ())
2417 dump_printf_loc (MSG_NOTE, vect_location,
2418 "SLP instance %p can use load/store-lanes\n",
2419 instance);
2421 else
2423 can_use_lanes = false;
2424 break;
2428 /* If all SLP instances can use load/store-lanes abort SLP and try again
2429 with SLP disabled. */
2430 if (can_use_lanes)
2432 ok = opt_result::failure_at (vect_location,
2433 "Built SLP cancelled: can use "
2434 "load/store-lanes\n");
2435 if (dump_enabled_p ())
2436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2437 "Built SLP cancelled: all SLP instances support "
2438 "load/store-lanes\n");
2439 goto again;
2443 /* Dissolve SLP-only groups. */
2444 vect_dissolve_slp_only_groups (loop_vinfo);
2446 /* Scan all the remaining operations in the loop that are not subject
2447 to SLP and make sure they are vectorizable. */
2448 ok = vect_analyze_loop_operations (loop_vinfo);
2449 if (!ok)
2451 if (dump_enabled_p ())
2452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2453 "bad operation or unsupported loop bound.\n");
2454 return ok;
2457 /* For now we don't expect to mix both masking and length approaches for one
2458 loop; disable the use of partial vectors if both are recorded. */
2459 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2460 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2461 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2463 if (dump_enabled_p ())
2464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2465 "can't vectorize a loop with partial vectors"
2466 " because we don't expect to mix different"
2467 " approaches with partial vectors for the"
2468 " same loop.\n");
2469 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2472 /* If we still have the option of using partial vectors,
2473 check whether we can generate the necessary loop controls. */
2474 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2475 && !vect_verify_full_masking (loop_vinfo)
2476 && !vect_verify_loop_lens (loop_vinfo))
2477 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2479 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2480 to be able to handle fewer than VF scalars, or needs to have a lower VF
2481 than the main loop. */
2482 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2483 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2484 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2485 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2486 return opt_result::failure_at (vect_location,
2487 "Vectorization factor too high for"
2488 " epilogue loop.\n");
2490 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2491 assuming that the loop will be used as a main loop. We will redo
2492 this analysis later if we instead decide to use the loop as an
2493 epilogue loop. */
2494 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2495 if (!ok)
2496 return ok;
2498 /* Check the costings of the loop make vectorizing worthwhile. */
2499 res = vect_analyze_loop_costing (loop_vinfo);
2500 if (res < 0)
2502 ok = opt_result::failure_at (vect_location,
2503 "Loop costings may not be worthwhile.\n");
2504 goto again;
2506 if (!res)
2507 return opt_result::failure_at (vect_location,
2508 "Loop costings not worthwhile.\n");
2510 /* If an epilogue loop is required make sure we can create one. */
2511 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2512 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2516 if (!vect_can_advance_ivs_p (loop_vinfo)
2517 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2518 single_exit (LOOP_VINFO_LOOP
2519 (loop_vinfo))))
2521 ok = opt_result::failure_at (vect_location,
2522 "not vectorized: can't create required "
2523 "epilog loop\n");
2524 goto again;
2528 /* During peeling, we need to check whether the number of loop iterations
2529 is enough for both the peeled prolog loop and the vector loop. This
2530 check can be merged with the threshold check of loop versioning, so
2531 increase the threshold for this case if necessary.
2533 If we are analyzing an epilogue we still want to check what its
2534 versioning threshold would be. If we decide to vectorize the epilogues we
2535 will want to use the lowest versioning threshold of all epilogues and main
2536 loop. This will enable us to enter a vectorized epilogue even when
2537 versioning the loop. We can't simply check whether the epilogue requires
2538 versioning though since we may have skipped some versioning checks when
2539 analyzing the epilogue. For instance, checks for alias versioning will be
2540 skipped when dealing with epilogues as we assume we already checked them
2541 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2542 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2544 poly_uint64 niters_th = 0;
2545 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2547 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2549 /* Niters for peeled prolog loop. */
2550 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2552 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2553 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2554 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2556 else
2557 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2560 /* Niters for at least one iteration of vectorized loop. */
2561 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2562 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2563 /* One additional iteration because of peeling for gap. */
2564 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2565 niters_th += 1;
2567 /* Use the same condition as vect_transform_loop to decide when to use
2568 the cost to determine a versioning threshold. */
2569 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2570 && ordered_p (th, niters_th))
2571 niters_th = ordered_max (poly_uint64 (th), niters_th);
2573 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2576 gcc_assert (known_eq (vectorization_factor,
2577 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2579 /* Ok to vectorize! */
2580 return opt_result::success ();
2582 again:
2583 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2584 gcc_assert (!ok);
2586 /* Try again with SLP forced off, but if we didn't do any SLP there is
2587 no point in re-trying. */
2588 if (!slp)
2589 return ok;
2591 /* If there are reduction chains re-trying will fail anyway. */
2592 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2593 return ok;
2595 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2596 via interleaving or lane instructions. */
2597 slp_instance instance;
2598 slp_tree node;
2599 unsigned i, j;
2600 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2602 stmt_vec_info vinfo;
2603 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2604 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2605 continue;
2606 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2607 unsigned int size = DR_GROUP_SIZE (vinfo);
2608 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2609 if (! vect_store_lanes_supported (vectype, size, false)
2610 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2611 && ! vect_grouped_store_supported (vectype, size))
2612 return opt_result::failure_at (vinfo->stmt,
2613 "unsupported grouped store\n");
2614 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2616 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2617 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2618 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2619 size = DR_GROUP_SIZE (vinfo);
2620 vectype = STMT_VINFO_VECTYPE (vinfo);
2621 if (! vect_load_lanes_supported (vectype, size, false)
2622 && ! vect_grouped_load_supported (vectype, single_element_p,
2623 size))
2624 return opt_result::failure_at (vinfo->stmt,
2625 "unsupported grouped load\n");
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_NOTE, vect_location,
2631 "re-trying with SLP disabled\n");
2633 /* Roll back state appropriately. No SLP this time. */
2634 slp = false;
2635 /* Restore the vectorization factor as it was without SLP. */
2636 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2637 /* Free the SLP instances. */
2638 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2639 vect_free_slp_instance (instance);
2640 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2641 /* Reset SLP type to loop_vect on all stmts. */
2642 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2644 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2645 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2646 !gsi_end_p (si); gsi_next (&si))
2648 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2649 STMT_SLP_TYPE (stmt_info) = loop_vect;
2650 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2651 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2653 /* vectorizable_reduction adjusts reduction stmt def-types;
2654 restore them to that of the PHI. */
2655 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2656 = STMT_VINFO_DEF_TYPE (stmt_info);
2657 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2658 (STMT_VINFO_REDUC_DEF (stmt_info)))
2659 = STMT_VINFO_DEF_TYPE (stmt_info);
2662 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2663 !gsi_end_p (si); gsi_next (&si))
2665 if (is_gimple_debug (gsi_stmt (si)))
2666 continue;
2667 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2668 STMT_SLP_TYPE (stmt_info) = loop_vect;
2669 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2671 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2672 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2673 STMT_SLP_TYPE (stmt_info) = loop_vect;
2674 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2675 !gsi_end_p (pi); gsi_next (&pi))
2676 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2677 = loop_vect;
2681 /* Free optimized alias test DDRS. */
2682 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2683 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2684 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2685 /* Reset target cost data. */
2686 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2687 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2688 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2689 /* Reset accumulated rgroup information. */
2690 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2691 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2692 /* Reset assorted flags. */
2693 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2694 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2695 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2696 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2697 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2698 = saved_can_use_partial_vectors_p;
2700 goto start_over;
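/* One example of the "again" path above being taken (illustrative
   only; the real decision is made by vect_store_lanes_supported and
   vect_load_lanes_supported): a loop such as

     for (i = 0; i < n; i++)
       {
         out[2*i]   = in[2*i+1];
         out[2*i+1] = in[2*i];
       }

   can be SLP-vectorized using load permutations, but on a target with
   load/store-lanes style instructions (for instance the ld2/st2 family
   on AArch64) the non-SLP interleaving scheme is assumed to be
   cheaper, so the SLP attempt is cancelled and the whole analysis is
   redone from start_over with SLP disabled.  */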
2703 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2704 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2705 OLD_LOOP_VINFO is better unless something specifically indicates
2706 otherwise.
2708 Note that this deliberately isn't a partial order. */
2710 static bool
2711 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2712 loop_vec_info old_loop_vinfo)
2714 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2715 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2717 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2718 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2720 /* Always prefer a VF of loop->simdlen over any other VF. */
2721 if (loop->simdlen)
2723 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2724 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2725 if (new_simdlen_p != old_simdlen_p)
2726 return new_simdlen_p;
2729 /* Limit the VFs to what is likely to be the maximum number of iterations,
2730 to handle cases in which at least one loop_vinfo is fully-masked. */
2731 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2732 if (estimated_max_niter != -1)
2734 if (known_le (estimated_max_niter, new_vf))
2735 new_vf = estimated_max_niter;
2736 if (known_le (estimated_max_niter, old_vf))
2737 old_vf = estimated_max_niter;
2740 /* Check whether the (fractional) cost per scalar iteration is lower
2741 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2742 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2743 * poly_widest_int (old_vf));
2744 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2745 * poly_widest_int (new_vf));
2746 if (maybe_lt (rel_old, rel_new))
2748 /* When old_loop_vinfo uses a variable vectorization factor,
2749 we know that it has a lower cost for at least one runtime VF.
2750 However, we don't know how likely that VF is.
2752 One option would be to compare the costs for the estimated VFs.
2753 The problem is that that can put too much pressure on the cost
2754 model. E.g. if the estimated VF is also the lowest possible VF,
2755 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2756 for the estimated VF, we'd then choose new_loop_vinfo even
2757 though (a) new_loop_vinfo might not actually be better than
2758 old_loop_vinfo for that VF and (b) it would be significantly
2759 worse at larger VFs.
2761 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2762 no more expensive than old_loop_vinfo even after doubling the
2763 estimated old_loop_vinfo VF. For all but trivial loops, this
2764 ensures that we only pick new_loop_vinfo if it is significantly
2765 better than old_loop_vinfo at the estimated VF. */
2766 if (rel_new.is_constant ())
2767 return false;
2769 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2770 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2771 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2772 * widest_int (old_estimated_vf));
2773 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2774 * widest_int (new_estimated_vf));
2775 return estimated_rel_new * 2 <= estimated_rel_old;
2777 if (known_lt (rel_new, rel_old))
2778 return true;
2780 /* If there's nothing to choose between the loop bodies, see whether
2781 there's a difference in the prologue and epilogue costs. */
2782 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2783 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2785 return false;
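/* The body-cost comparison above avoids divisions by cross-multiplying:
   new_inside_cost / new_vf is compared with old_inside_cost / old_vf via

     rel_new = new_inside_cost * old_vf
     rel_old = old_inside_cost * new_vf

   For example (made-up costs), an inside cost of 20 at VF 8 beats an
   inside cost of 12 at VF 4, because 20 * 4 = 80 < 12 * 8 = 96,
   i.e. 2.5 units per scalar iteration versus 3.  If old_loop_vinfo
   might be cheaper only for some runtime VF, the doubling heuristic
   described in the comment above is used instead.  */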
2788 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2789 true if we should. */
2791 static bool
2792 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2793 loop_vec_info old_loop_vinfo)
2795 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2796 return false;
2798 if (dump_enabled_p ())
2799 dump_printf_loc (MSG_NOTE, vect_location,
2800 "***** Preferring vector mode %s to vector mode %s\n",
2801 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2802 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2803 return true;
2806 /* Function vect_analyze_loop.
2808 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2809 for it. The different analyses will record information in the
2810 loop_vec_info struct. */
2811 opt_loop_vec_info
2812 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2814 auto_vector_modes vector_modes;
2816 /* Autodetect first vector size we try. */
2817 unsigned int autovec_flags
2818 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2819 loop->simdlen != 0);
2820 unsigned int mode_i = 0;
2822 DUMP_VECT_SCOPE ("analyze_loop_nest");
2824 if (loop_outer (loop)
2825 && loop_vec_info_for_loop (loop_outer (loop))
2826 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2827 return opt_loop_vec_info::failure_at (vect_location,
2828 "outer-loop already vectorized.\n");
2830 if (!find_loop_nest (loop, &shared->loop_nest))
2831 return opt_loop_vec_info::failure_at
2832 (vect_location,
2833 "not vectorized: loop nest containing two or more consecutive inner"
2834 " loops cannot be vectorized\n");
2836 unsigned n_stmts = 0;
2837 machine_mode autodetected_vector_mode = VOIDmode;
2838 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2839 machine_mode next_vector_mode = VOIDmode;
2840 poly_uint64 lowest_th = 0;
2841 unsigned vectorized_loops = 0;
2842 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2843 && !unlimited_cost_model (loop));
2845 bool vect_epilogues = false;
2846 opt_result res = opt_result::success ();
2847 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2848 while (1)
2850 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2851 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2852 if (!loop_vinfo)
2854 if (dump_enabled_p ())
2855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2856 "bad loop form.\n");
2857 gcc_checking_assert (first_loop_vinfo == NULL);
2858 return loop_vinfo;
2860 loop_vinfo->vector_mode = next_vector_mode;
2862 bool fatal = false;
2864 /* When pick_lowest_cost_p is true, we should in principle iterate
2865 over all the loop_vec_infos that LOOP_VINFO could replace and
2866 try to vectorize LOOP_VINFO under the same conditions.
2867 E.g. when trying to replace an epilogue loop, we should vectorize
2868 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2869 to replace the main loop, we should vectorize LOOP_VINFO as a main
2870 loop too.
2872 However, autovectorize_vector_modes is usually sorted as follows:
2874 - Modes that naturally produce lower VFs usually follow modes that
2875 naturally produce higher VFs.
2877 - When modes naturally produce the same VF, maskable modes
2878 usually follow unmaskable ones, so that the maskable mode
2879 can be used to vectorize the epilogue of the unmaskable mode.
2881 This order is preferred because it leads to the maximum
2882 epilogue vectorization opportunities. Targets should only use
2883 a different order if they want to make wide modes available while
2884 disparaging them relative to earlier, smaller modes. The assumption
2885 in that case is that the wider modes are more expensive in some
2886 way that isn't reflected directly in the costs.
2888 There should therefore be few interesting cases in which
2889 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2890 treated as a standalone loop, and ends up being genuinely cheaper
2891 than FIRST_LOOP_VINFO. */
2892 if (vect_epilogues)
2893 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2895 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2896 if (mode_i == 0)
2897 autodetected_vector_mode = loop_vinfo->vector_mode;
2898 if (dump_enabled_p ())
2900 if (res)
2901 dump_printf_loc (MSG_NOTE, vect_location,
2902 "***** Analysis succeeded with vector mode %s\n",
2903 GET_MODE_NAME (loop_vinfo->vector_mode));
2904 else
2905 dump_printf_loc (MSG_NOTE, vect_location,
2906 "***** Analysis failed with vector mode %s\n",
2907 GET_MODE_NAME (loop_vinfo->vector_mode));
2910 loop->aux = NULL;
2912 if (!fatal)
2913 while (mode_i < vector_modes.length ()
2914 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2916 if (dump_enabled_p ())
2917 dump_printf_loc (MSG_NOTE, vect_location,
2918 "***** The result for vector mode %s would"
2919 " be the same\n",
2920 GET_MODE_NAME (vector_modes[mode_i]));
2921 mode_i += 1;
2924 if (res)
2926 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2927 vectorized_loops++;
2929 /* Once we hit the desired simdlen for the first time,
2930 discard any previous attempts. */
2931 if (simdlen
2932 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2934 delete first_loop_vinfo;
2935 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2936 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2937 simdlen = 0;
2939 else if (pick_lowest_cost_p && first_loop_vinfo)
2941 /* Keep trying to roll back vectorization attempts while the
2942 loop_vec_infos they produced were worse than this one. */
2943 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2944 while (!vinfos.is_empty ()
2945 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2947 gcc_assert (vect_epilogues);
2948 delete vinfos.pop ();
2950 if (vinfos.is_empty ()
2951 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2953 delete first_loop_vinfo;
2954 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2955 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2959 if (first_loop_vinfo == NULL)
2961 first_loop_vinfo = loop_vinfo;
2962 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2964 else if (vect_epilogues
2965 /* For now only allow one epilogue loop. */
2966 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2968 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2969 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2970 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2971 || maybe_ne (lowest_th, 0U));
2972 /* Keep track of the known smallest versioning
2973 threshold. */
2974 if (ordered_p (lowest_th, th))
2975 lowest_th = ordered_min (lowest_th, th);
2977 else
2979 delete loop_vinfo;
2980 loop_vinfo = opt_loop_vec_info::success (NULL);
2983 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2984 enabled, SIMDUID is not set, it is the innermost loop and we have
2985 either already found the loop's SIMDLEN or there was no SIMDLEN to
2986 begin with.
2987 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2988 vect_epilogues = (!simdlen
2989 && loop->inner == NULL
2990 && param_vect_epilogues_nomask
2991 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2992 && !loop->simduid
2993 /* For now only allow one epilogue loop, but allow
2994 pick_lowest_cost_p to replace it. */
2995 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2996 || pick_lowest_cost_p));
2998 /* Commit to first_loop_vinfo if we have no reason to try
2999 alternatives. */
3000 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3001 break;
3003 else
3005 delete loop_vinfo;
3006 loop_vinfo = opt_loop_vec_info::success (NULL);
3007 if (fatal)
3009 gcc_checking_assert (first_loop_vinfo == NULL);
3010 break;
3014 /* Handle the case in which the original loop can use partial
3015 vectorization, but we only want to adopt it for the epilogue.
3016 The retry should be in the same vector mode as the original. */
3017 if (vect_epilogues
3018 && loop_vinfo
3019 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3021 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3022 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3023 if (dump_enabled_p ())
3024 dump_printf_loc (MSG_NOTE, vect_location,
3025 "***** Re-trying analysis with same vector mode"
3026 " %s for epilogue with partial vectors.\n",
3027 GET_MODE_NAME (loop_vinfo->vector_mode));
3028 continue;
3031 if (mode_i < vector_modes.length ()
3032 && VECTOR_MODE_P (autodetected_vector_mode)
3033 && (related_vector_mode (vector_modes[mode_i],
3034 GET_MODE_INNER (autodetected_vector_mode))
3035 == autodetected_vector_mode)
3036 && (related_vector_mode (autodetected_vector_mode,
3037 GET_MODE_INNER (vector_modes[mode_i]))
3038 == vector_modes[mode_i]))
3040 if (dump_enabled_p ())
3041 dump_printf_loc (MSG_NOTE, vect_location,
3042 "***** Skipping vector mode %s, which would"
3043 " repeat the analysis for %s\n",
3044 GET_MODE_NAME (vector_modes[mode_i]),
3045 GET_MODE_NAME (autodetected_vector_mode));
3046 mode_i += 1;
3049 if (mode_i == vector_modes.length ()
3050 || autodetected_vector_mode == VOIDmode)
3051 break;
3053 /* Try the next biggest vector size. */
3054 next_vector_mode = vector_modes[mode_i++];
3055 if (dump_enabled_p ())
3056 dump_printf_loc (MSG_NOTE, vect_location,
3057 "***** Re-trying analysis with vector mode %s\n",
3058 GET_MODE_NAME (next_vector_mode));
3061 if (first_loop_vinfo)
3063 loop->aux = (loop_vec_info) first_loop_vinfo;
3064 if (dump_enabled_p ())
3065 dump_printf_loc (MSG_NOTE, vect_location,
3066 "***** Choosing vector mode %s\n",
3067 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3068 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3069 return first_loop_vinfo;
3072 return opt_loop_vec_info::propagate_failure (res);
3075 /* Return true if there is an in-order reduction function for CODE, storing
3076 it in *REDUC_FN if so. */
3078 static bool
3079 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3081 switch (code)
3083 case PLUS_EXPR:
3084 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3085 return true;
3087 default:
3088 return false;
3092 /* Function reduction_fn_for_scalar_code
3094 Input:
3095 CODE - tree_code of a reduction operation.
3097 Output:
3098 REDUC_FN - the corresponding internal function to be used to reduce the
3099 vector of partial results into a single scalar result, or IFN_LAST
3100 if the operation is a supported reduction operation, but does not have
3101 such an internal function.
3103 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3105 static bool
3106 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3108 switch (code)
3110 case MAX_EXPR:
3111 *reduc_fn = IFN_REDUC_MAX;
3112 return true;
3114 case MIN_EXPR:
3115 *reduc_fn = IFN_REDUC_MIN;
3116 return true;
3118 case PLUS_EXPR:
3119 *reduc_fn = IFN_REDUC_PLUS;
3120 return true;
3122 case BIT_AND_EXPR:
3123 *reduc_fn = IFN_REDUC_AND;
3124 return true;
3126 case BIT_IOR_EXPR:
3127 *reduc_fn = IFN_REDUC_IOR;
3128 return true;
3130 case BIT_XOR_EXPR:
3131 *reduc_fn = IFN_REDUC_XOR;
3132 return true;
3134 case MULT_EXPR:
3135 case MINUS_EXPR:
3136 *reduc_fn = IFN_LAST;
3137 return true;
3139 default:
3140 return false;
3144 /* If there is a neutral value X such that SLP reduction NODE would not
3145 be affected by the introduction of additional X elements, return that X,
3146 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3147 is the vector type that would hold element X. REDUC_CHAIN is true if
3148 the SLP statements perform a single reduction, false if each statement
3149 performs an independent reduction. */
3151 static tree
3152 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3153 tree_code code, bool reduc_chain)
3155 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3156 stmt_vec_info stmt_vinfo = stmts[0];
3157 tree scalar_type = TREE_TYPE (vector_type);
3158 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3159 gcc_assert (loop);
3161 switch (code)
3163 case WIDEN_SUM_EXPR:
3164 case DOT_PROD_EXPR:
3165 case SAD_EXPR:
3166 case PLUS_EXPR:
3167 case MINUS_EXPR:
3168 case BIT_IOR_EXPR:
3169 case BIT_XOR_EXPR:
3170 return build_zero_cst (scalar_type);
3172 case MULT_EXPR:
3173 return build_one_cst (scalar_type);
3175 case BIT_AND_EXPR:
3176 return build_all_ones_cst (scalar_type);
3178 case MAX_EXPR:
3179 case MIN_EXPR:
3180 /* For MIN/MAX the initial values are neutral. A reduction chain
3181 has only a single initial value, so that value is neutral for
3182 all statements. */
3183 if (reduc_chain)
3184 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3185 loop_preheader_edge (loop));
3186 return NULL_TREE;
3188 default:
3189 return NULL_TREE;
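/* For example, an SLP reduction of the form

     sum0 += a[2*i];
     sum1 += a[2*i+1];

   can be padded with extra 0 elements without changing either sum, so
   zero is the neutral value returned for PLUS_EXPR (and similarly one
   for MULT_EXPR and an all-ones value for BIT_AND_EXPR).  MIN_EXPR and
   MAX_EXPR have no universal neutral element, which is why a non-NULL
   value is returned for them only in the reduction-chain case, where
   the single initial value itself acts as the neutral element.  */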
3193 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3194 STMT is printed with a message MSG. */
3196 static void
3197 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3199 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3202 /* Return true if we need an in-order reduction for operation CODE
3203 on type TYPE. */
3206 bool
3207 needs_fold_left_reduction_p (tree type, tree_code code)
3209 /* CHECKME: check for !flag_finite_math_only too? */
3210 if (SCALAR_FLOAT_TYPE_P (type))
3211 switch (code)
3213 case MIN_EXPR:
3214 case MAX_EXPR:
3215 return false;
3217 default:
3218 return !flag_associative_math;
3221 if (INTEGRAL_TYPE_P (type))
3223 if (!operation_no_trapping_overflow (type, code))
3224 return true;
3225 return false;
3228 if (SAT_FIXED_POINT_TYPE_P (type))
3229 return true;
3231 return false;
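/* For example, with

     double s = 0.0;
     for (i = 0; i < n; i++)
       s += x[i];

   reassociating the additions changes the rounding of the result, so
   unless -fassociative-math is in effect this function returns true
   and the reduction must be done in order (via IFN_FOLD_LEFT_PLUS,
   see fold_left_reduction_fn above, when that is available).  A
   floating-point MIN/MAX can always be reassociated, and an integer
   sum only needs in-order handling when the addition can trap on
   overflow (e.g. with -ftrapv).  */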
3234 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3235 has a handled computation expression. Store the main reduction
3236 operation in *CODE. */
3238 static bool
3239 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3240 tree loop_arg, enum tree_code *code,
3241 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3243 auto_bitmap visited;
3244 tree lookfor = PHI_RESULT (phi);
3245 ssa_op_iter curri;
3246 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3247 while (USE_FROM_PTR (curr) != loop_arg)
3248 curr = op_iter_next_use (&curri);
3249 curri.i = curri.numops;
3252 path.safe_push (std::make_pair (curri, curr));
3253 tree use = USE_FROM_PTR (curr);
3254 if (use == lookfor)
3255 break;
3256 gimple *def = SSA_NAME_DEF_STMT (use);
3257 if (gimple_nop_p (def)
3258 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3260 pop:
3263 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3264 curri = x.first;
3265 curr = x.second;
3267 curr = op_iter_next_use (&curri);
3268 /* Skip already visited or non-SSA operands (from iterating
3269 over PHI args). */
3270 while (curr != NULL_USE_OPERAND_P
3271 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3272 || ! bitmap_set_bit (visited,
3273 SSA_NAME_VERSION
3274 (USE_FROM_PTR (curr)))));
3276 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3277 if (curr == NULL_USE_OPERAND_P)
3278 break;
3280 else
3282 if (gimple_code (def) == GIMPLE_PHI)
3283 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3284 else
3285 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3286 while (curr != NULL_USE_OPERAND_P
3287 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3288 || ! bitmap_set_bit (visited,
3289 SSA_NAME_VERSION
3290 (USE_FROM_PTR (curr)))))
3291 curr = op_iter_next_use (&curri);
3292 if (curr == NULL_USE_OPERAND_P)
3293 goto pop;
3296 while (1);
3297 if (dump_file && (dump_flags & TDF_DETAILS))
3299 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3300 unsigned i;
3301 std::pair<ssa_op_iter, use_operand_p> *x;
3302 FOR_EACH_VEC_ELT (path, i, x)
3303 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3304 dump_printf (MSG_NOTE, "\n");
3307 /* Check whether the reduction path detected is valid. */
3308 bool fail = path.length () == 0;
3309 bool neg = false;
3310 int sign = -1;
3311 *code = ERROR_MARK;
3312 for (unsigned i = 1; i < path.length (); ++i)
3314 gimple *use_stmt = USE_STMT (path[i].second);
3315 tree op = USE_FROM_PTR (path[i].second);
3316 if (! is_gimple_assign (use_stmt)
3317 /* The following makes sure we can compute the operand index
3318 easily, plus it mostly disallows chaining via COND_EXPR condition
3319 operands. */
3320 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3321 && (gimple_num_ops (use_stmt) <= 2
3322 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3323 && (gimple_num_ops (use_stmt) <= 3
3324 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3326 fail = true;
3327 break;
3329 /* Check that the op is used on only a single stmt. For the
3330 non-value-changing tail and the last stmt, allow out-of-loop uses.
3331 ??? We could relax this and handle arbitrary live stmts by
3332 forcing a scalar epilogue for example. */
3333 imm_use_iterator imm_iter;
3334 gimple *op_use_stmt;
3335 unsigned cnt = 0;
3336 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3337 if (!is_gimple_debug (op_use_stmt)
3338 && (*code != ERROR_MARK
3339 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3341 /* We want to allow x + x but not x < 1 ? x : 2. */
3342 if (is_gimple_assign (op_use_stmt)
3343 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3345 use_operand_p use_p;
3346 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3347 cnt++;
3349 else
3350 cnt++;
3352 if (cnt != 1)
3354 fail = true;
3355 break;
3357 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3358 if (use_code == MINUS_EXPR)
3360 use_code = PLUS_EXPR;
3361 /* Track whether we negate the reduction value each iteration. */
3362 if (gimple_assign_rhs2 (use_stmt) == op)
3363 neg = ! neg;
3365 if (CONVERT_EXPR_CODE_P (use_code)
3366 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3367 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3369 else if (*code == ERROR_MARK)
3371 *code = use_code;
3372 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3374 else if (use_code != *code)
3376 fail = true;
3377 break;
3379 else if ((use_code == MIN_EXPR
3380 || use_code == MAX_EXPR)
3381 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3383 fail = true;
3384 break;
3387 return ! fail && ! neg && *code != ERROR_MARK;
3390 bool
3391 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3392 tree loop_arg, enum tree_code code)
3394 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3395 enum tree_code code_;
3396 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3397 && code_ == code);
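/* A sketch of what the path walk above finds: for a simple sum

     s_1 = PHI <s_0 (preheader), s_3 (latch)>
     _2  = a[i_5];
     s_3 = s_1 + _2;

   the walk from the latch value s_3 back to the PHI result s_1 goes
   through the single statement s_3 = s_1 + _2, so *CODE is set to
   PLUS_EXPR and the check succeeds.  A path that mixes operation
   codes, e.g. s_3 = (s_1 + _2) * _4, or one in which an intermediate
   value has additional uses inside the loop, causes the check to
   fail.  */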
3402 /* Function vect_is_simple_reduction
3404 (1) Detect a cross-iteration def-use cycle that represents a simple
3405 reduction computation. We look for the following pattern:
3407 loop_header:
3408 a1 = phi < a0, a2 >
3409 a3 = ...
3410 a2 = operation (a3, a1)
or
3414 a3 = ...
3415 loop_header:
3416 a1 = phi < a0, a2 >
3417 a2 = operation (a3, a1)
3419 such that:
3420 1. operation is commutative and associative and it is safe to
3421 change the order of the computation
3422 2. no uses for a2 in the loop (a2 is used out of the loop)
3423 3. no uses of a1 in the loop besides the reduction operation
3424 4. no uses of a1 outside the loop.
3426 Conditions 1,4 are tested here.
3427 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3429 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3430 nested cycles.
3432 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3433 reductions:
3435 a1 = phi < a0, a2 >
3436 inner loop (def of a3)
3437 a2 = phi < a3 >
3439 (4) Detect condition expressions, i.e.:
3440 for (int i = 0; i < N; i++)
3441 if (a[i] < val)
3442 ret_val = a[i];
3446 static stmt_vec_info
3447 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3448 bool *double_reduc, bool *reduc_chain_p)
3450 gphi *phi = as_a <gphi *> (phi_info->stmt);
3451 gimple *phi_use_stmt = NULL;
3452 imm_use_iterator imm_iter;
3453 use_operand_p use_p;
3455 *double_reduc = false;
3456 *reduc_chain_p = false;
3457 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3459 tree phi_name = PHI_RESULT (phi);
3460 /* ??? If there are no uses of the PHI result the inner loop reduction
3461 won't be detected as possibly double-reduction by vectorizable_reduction
3462 because that tries to walk the PHI arg from the preheader edge which
3463 can be constant. See PR60382. */
3464 if (has_zero_uses (phi_name))
3465 return NULL;
3466 class loop *loop = (gimple_bb (phi))->loop_father;
3467 unsigned nphi_def_loop_uses = 0;
3468 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3470 gimple *use_stmt = USE_STMT (use_p);
3471 if (is_gimple_debug (use_stmt))
3472 continue;
3474 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3476 if (dump_enabled_p ())
3477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3478 "intermediate value used outside loop.\n");
3480 return NULL;
3483 nphi_def_loop_uses++;
3484 phi_use_stmt = use_stmt;
3487 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3488 if (TREE_CODE (latch_def) != SSA_NAME)
3490 if (dump_enabled_p ())
3491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3492 "reduction: not ssa_name: %T\n", latch_def);
3493 return NULL;
3496 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3497 if (!def_stmt_info
3498 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3499 return NULL;
3501 bool nested_in_vect_loop
3502 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3503 unsigned nlatch_def_loop_uses = 0;
3504 auto_vec<gphi *, 3> lcphis;
3505 bool inner_loop_of_double_reduc = false;
3506 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3508 gimple *use_stmt = USE_STMT (use_p);
3509 if (is_gimple_debug (use_stmt))
3510 continue;
3511 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3512 nlatch_def_loop_uses++;
3513 else
3515 /* We can have more than one loop-closed PHI. */
3516 lcphis.safe_push (as_a <gphi *> (use_stmt));
3517 if (nested_in_vect_loop
3518 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3519 == vect_double_reduction_def))
3520 inner_loop_of_double_reduc = true;
3524 /* If we are vectorizing an inner reduction we are executing that
3525 in the original order only in case we are not dealing with a
3526 double reduction. */
3527 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3529 if (dump_enabled_p ())
3530 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3531 "detected nested cycle: ");
3532 return def_stmt_info;
3535 /* If this isn't a nested cycle or if the nested cycle reduction value
3536 is used outside of the inner loop we cannot handle uses of the reduction
3537 value. */
3538 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3542 "reduction used in loop.\n");
3543 return NULL;
3546 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3547 defined in the inner loop. */
3548 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3550 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3551 if (gimple_phi_num_args (def_stmt) != 1
3552 || TREE_CODE (op1) != SSA_NAME)
3554 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3556 "unsupported phi node definition.\n");
3558 return NULL;
3561 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3562 if (gimple_bb (def1)
3563 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3564 && loop->inner
3565 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3566 && is_gimple_assign (def1)
3567 && is_a <gphi *> (phi_use_stmt)
3568 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3570 if (dump_enabled_p ())
3571 report_vect_op (MSG_NOTE, def_stmt,
3572 "detected double reduction: ");
3574 *double_reduc = true;
3575 return def_stmt_info;
3578 return NULL;
3581 /* Look for the expression computing latch_def from the loop PHI result. */
3582 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3583 enum tree_code code;
3584 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3585 path))
3587 STMT_VINFO_REDUC_CODE (phi_info) = code;
3588 if (code == COND_EXPR && !nested_in_vect_loop)
3589 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3591 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3592 reduction chain for which the additional restriction is that
3593 all operations in the chain are the same. */
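	 /* For example (an illustrative sketch), a loop body like
	      s += a[i];  s += b[i];
	    becomes  s_2 = s_1 + a[i];  s_3 = s_2 + b[i];  in SSA form: both
	    statements use PLUS_EXPR and each intermediate value has a single
	    use, so both end up in one reduction chain.  */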
3594 auto_vec<stmt_vec_info, 8> reduc_chain;
3595 unsigned i;
3596 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3597 for (i = path.length () - 1; i >= 1; --i)
3599 gimple *stmt = USE_STMT (path[i].second);
3600 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3601 STMT_VINFO_REDUC_IDX (stmt_info)
3602 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3603 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3604 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3605 && (i == 1 || i == path.length () - 1));
3606 if ((stmt_code != code && !leading_conversion)
3607 /* We can only handle the final value in epilogue
3608 generation for reduction chains. */
3609 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3610 is_slp_reduc = false;
3611 /* For reduction chains we support trailing/leading
3612 conversions. We do not store those in the actual chain. */
3613 if (leading_conversion)
3614 continue;
3615 reduc_chain.safe_push (stmt_info);
3617 if (is_slp_reduc && reduc_chain.length () > 1)
3619 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3621 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3622 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3624 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3625 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3627 /* Save the chain for further analysis in SLP detection. */
3628 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3629 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3631 *reduc_chain_p = true;
3632 if (dump_enabled_p ())
3633 dump_printf_loc (MSG_NOTE, vect_location,
3634 "reduction: detected reduction chain\n");
3636 else if (dump_enabled_p ())
3637 dump_printf_loc (MSG_NOTE, vect_location,
3638 "reduction: detected reduction\n");
3640 return def_stmt_info;
3643 if (dump_enabled_p ())
3644 dump_printf_loc (MSG_NOTE, vect_location,
3645 "reduction: unknown pattern\n");
3647 return NULL;
3650 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3651 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3652 or -1 if not known. */
3654 static int
3655 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3657 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3658 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3660 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_NOTE, vect_location,
3662 "cost model: epilogue peel iters set to vf/2 "
3663 "because loop iterations are unknown .\n");
3664 return assumed_vf / 2;
3666 else
3668 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3669 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3670 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3671 /* If we need to peel for gaps but no epilogue peeling would otherwise
3672 be required, we have to peel VF iterations. */
3673 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3674 peel_iters_epilogue = assumed_vf;
3675 return peel_iters_epilogue;
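/* For example (illustrative numbers): with NITERS = 100, an assumed VF of 8
   and PEEL_ITERS_PROLOGUE = 3 this returns (100 - 3) % 8 = 1; if peeling for
   gaps were required and that remainder were 0, it would return a full 8
   iterations instead.  */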
3679 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3680 int
3681 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3682 int *peel_iters_epilogue,
3683 stmt_vector_for_cost *scalar_cost_vec,
3684 stmt_vector_for_cost *prologue_cost_vec,
3685 stmt_vector_for_cost *epilogue_cost_vec)
3687 int retval = 0;
3689 *peel_iters_epilogue
3690 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3692 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3694 /* If peeled iterations are known but the number of scalar loop
3695 iterations is unknown, count a taken branch per peeled loop. */
3696 if (peel_iters_prologue > 0)
3697 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3698 NULL, NULL_TREE, 0, vect_prologue);
3699 if (*peel_iters_epilogue > 0)
3700 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3701 NULL, NULL_TREE, 0, vect_epilogue);
3704 stmt_info_for_cost *si;
3705 int j;
3706 if (peel_iters_prologue)
3707 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3708 retval += record_stmt_cost (prologue_cost_vec,
3709 si->count * peel_iters_prologue,
3710 si->kind, si->stmt_info, si->misalign,
3711 vect_prologue);
3712 if (*peel_iters_epilogue)
3713 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3714 retval += record_stmt_cost (epilogue_cost_vec,
3715 si->count * *peel_iters_epilogue,
3716 si->kind, si->stmt_info, si->misalign,
3717 vect_epilogue);
3719 return retval;
3722 /* Function vect_estimate_min_profitable_iters
3724 Return the number of iterations required for the vector version of the
3725 loop to be profitable relative to the cost of the scalar version of the
3726 loop.
3728 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3729 of iterations for vectorization. A value of -1 means loop vectorization
3730 is not profitable. This returned value may be used for a dynamic
3731 profitability check.
3733 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3734 for a static check against the estimated number of iterations. */
3736 static void
3737 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3738 int *ret_min_profitable_niters,
3739 int *ret_min_profitable_estimate)
3741 int min_profitable_iters;
3742 int min_profitable_estimate;
3743 int peel_iters_prologue;
3744 int peel_iters_epilogue;
3745 unsigned vec_inside_cost = 0;
3746 int vec_outside_cost = 0;
3747 unsigned vec_prologue_cost = 0;
3748 unsigned vec_epilogue_cost = 0;
3749 int scalar_single_iter_cost = 0;
3750 int scalar_outside_cost = 0;
3751 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3752 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3753 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3755 /* Cost model disabled. */
3756 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3758 if (dump_enabled_p ())
3759 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3760 *ret_min_profitable_niters = 0;
3761 *ret_min_profitable_estimate = 0;
3762 return;
3765 /* Requires loop versioning tests to handle misalignment. */
3766 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3768 /* FIXME: Make cost depend on complexity of individual check. */
3769 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3770 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3771 NULL, NULL_TREE, 0, vect_prologue);
3772 if (dump_enabled_p ())
3773 dump_printf (MSG_NOTE,
3774 "cost model: Adding cost of checks for loop "
3775 "versioning to treat misalignment.\n");
3778 /* Requires loop versioning with alias checks. */
3779 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3781 /* FIXME: Make cost depend on complexity of individual check. */
3782 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3783 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3784 NULL, NULL_TREE, 0, vect_prologue);
3785 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3786 if (len)
3787 /* Count LEN - 1 ANDs and LEN comparisons. */
3788 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3789 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3790 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3791 if (len)
3793 /* Count LEN - 1 ANDs and LEN comparisons. */
3794 unsigned int nstmts = len * 2 - 1;
3795 /* +1 for each bias that needs adding. */
3796 for (unsigned int i = 0; i < len; ++i)
3797 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3798 nstmts += 1;
3799 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3800 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3802 if (dump_enabled_p ())
3803 dump_printf (MSG_NOTE,
3804 "cost model: Adding cost of checks for loop "
3805 "versioning aliasing.\n");
3808 /* Requires loop versioning with niter checks. */
3809 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3811 /* FIXME: Make cost depend on complexity of individual check. */
3812 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3813 NULL, NULL_TREE, 0, vect_prologue);
3814 if (dump_enabled_p ())
3815 dump_printf (MSG_NOTE,
3816 "cost model: Adding cost of checks for loop "
3817 "versioning niters.\n");
3820 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3821 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3822 NULL, NULL_TREE, 0, vect_prologue);
3824 /* Count statements in the scalar loop. Use this as the scalar cost of a
3825 single iteration for now.
3827 TODO: Add outer loop support.
3829 TODO: Consider assigning different costs to different scalar
3830 statements. */
3832 scalar_single_iter_cost
3833 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3835 /* Add additional cost for the peeled instructions in prologue and epilogue
3836 loop. (For fully-masked loops there will be no peeling.)
3838 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3839 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3841 TODO: Build an expression that represents peel_iters for prologue and
3842 epilogue to be used in a run-time test. */
3844 bool prologue_need_br_taken_cost = false;
3845 bool prologue_need_br_not_taken_cost = false;
3847 /* Calculate peel_iters_prologue. */
3848 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3849 peel_iters_prologue = 0;
3850 else if (npeel < 0)
3852 peel_iters_prologue = assumed_vf / 2;
3853 if (dump_enabled_p ())
3854 dump_printf (MSG_NOTE, "cost model: "
3855 "prologue peel iters set to vf/2.\n");
3857 /* If peeled iterations are unknown, count a taken branch and a not taken
3858 branch per peeled loop. Even if scalar loop iterations are known,
3859 vector iterations are not known since peeled prologue iterations are
3860 not known. Hence guards remain the same. */
3861 prologue_need_br_taken_cost = true;
3862 prologue_need_br_not_taken_cost = true;
3864 else
3866 peel_iters_prologue = npeel;
3867 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3868 /* If peeled iterations are known but the number of scalar loop
3869 iterations is unknown, count a taken branch per peeled loop. */
3870 prologue_need_br_taken_cost = true;
3873 bool epilogue_need_br_taken_cost = false;
3874 bool epilogue_need_br_not_taken_cost = false;
3876 /* Calculate peel_iters_epilogue. */
3877 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3878 /* We need to peel exactly one iteration for gaps. */
3879 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3880 else if (npeel < 0)
3882 /* If peeling for alignment is unknown, the loop bound of the main loop
3883 becomes unknown. */
3884 peel_iters_epilogue = assumed_vf / 2;
3885 if (dump_enabled_p ())
3886 dump_printf (MSG_NOTE, "cost model: "
3887 "epilogue peel iters set to vf/2 because "
3888 "peeling for alignment is unknown.\n");
3890 /* See the same reasoning above in the peel_iters_prologue calculation. */
3891 epilogue_need_br_taken_cost = true;
3892 epilogue_need_br_not_taken_cost = true;
3894 else
3896 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3897 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3898 /* If peeled iterations are known but the number of scalar loop
3899 iterations is unknown, count a taken branch per peeled loop. */
3900 epilogue_need_br_taken_cost = true;
3903 stmt_info_for_cost *si;
3904 int j;
3905 /* Add costs associated with peel_iters_prologue. */
3906 if (peel_iters_prologue)
3907 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3909 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3910 si->count * peel_iters_prologue, si->kind,
3911 si->stmt_info, si->vectype, si->misalign,
3912 vect_prologue);
3915 /* Add costs associated with peel_iters_epilogue. */
3916 if (peel_iters_epilogue)
3917 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3919 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3920 si->count * peel_iters_epilogue, si->kind,
3921 si->stmt_info, si->vectype, si->misalign,
3922 vect_epilogue);
3925 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3927 if (prologue_need_br_taken_cost)
3928 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3929 NULL, NULL_TREE, 0, vect_prologue);
3931 if (prologue_need_br_not_taken_cost)
3932 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3933 cond_branch_not_taken, NULL, NULL_TREE, 0,
3934 vect_prologue);
3936 if (epilogue_need_br_taken_cost)
3937 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3938 NULL, NULL_TREE, 0, vect_epilogue);
3940 if (epilogue_need_br_not_taken_cost)
3941 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3942 cond_branch_not_taken, NULL, NULL_TREE, 0,
3943 vect_epilogue);
3945 /* Take care of special costs for rgroup controls of partial vectors. */
3946 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3948 /* Calculate how many masks we need to generate. */
3949 unsigned int num_masks = 0;
3950 rgroup_controls *rgm;
3951 unsigned int num_vectors_m1;
3952 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3953 if (rgm->type)
3954 num_masks += num_vectors_m1 + 1;
3955 gcc_assert (num_masks > 0);
3957 /* In the worst case, we need to generate each mask in the prologue
3958 and in the loop body. One of the loop body mask instructions
3959 replaces the comparison in the scalar loop, and since we don't
3960 count the scalar comparison against the scalar body, we shouldn't
3961 count that vector instruction against the vector body either.
3963 Sometimes we can use unpacks instead of generating prologue
3964 masks and sometimes the prologue mask will fold to a constant,
3965 so the actual prologue cost might be smaller. However, it's
3966 simpler and safer to use the worst-case cost; if this ends up
3967 being the tie-breaker between vectorizing or not, then it's
3968 probably better not to vectorize. */
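	 /* For example (illustrative): with two mask rgroups needing one and
	    two vectors respectively, num_masks is 3, so the worst case charges
	    three vector statements to the prologue and 3 - 1 = 2 to the loop
	    body, one body mask standing in for the scalar loop's
	    comparison.  */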
3969 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
3970 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
3971 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
3972 vector_stmt, NULL, NULL_TREE, 0, vect_body);
3974 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
3976 /* Referring to the functions vect_set_loop_condition_partial_vectors
3977 and vect_set_loop_controls_directly, we need to generate each
3978 length in the prologue and in the loop body if required. Although
3979 there are some possible optimizations, we consider the worst case
3980 here. */
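      /* For example (illustrative): a single length rgroup with two vectors
	 (num_vectors_m1 == 1), nitems != 1, unknown niters, a possibly
	 wrapping IV and need_iterate_p set would count 1 + 2 + 2 + 2 = 7
	 prologue statements and 3 * 2 = 6 body statements below.  */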
3982 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
3983 bool need_iterate_p
3984 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3985 && !vect_known_niters_smaller_than_vf (loop_vinfo));
3987 /* Calculate how many statements to be added. */
3988 unsigned int prologue_stmts = 0;
3989 unsigned int body_stmts = 0;
3991 rgroup_controls *rgc;
3992 unsigned int num_vectors_m1;
3993 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
3994 if (rgc->type)
3996 /* May need one SHIFT for nitems_total computation. */
3997 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
3998 if (nitems != 1 && !niters_known_p)
3999 prologue_stmts += 1;
4001 /* May need one MAX and one MINUS for wrap around. */
4002 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4003 prologue_stmts += 2;
4005 /* Need one MAX and one MINUS for each batch limit except for
4006 the first one. */
4007 prologue_stmts += num_vectors_m1 * 2;
4009 unsigned int num_vectors = num_vectors_m1 + 1;
4011 /* Need to set up lengths in prologue, only one MIN required
4012 for each since start index is zero. */
4013 prologue_stmts += num_vectors;
4015 /* Each may need two MINs and one MINUS to update lengths in body
4016 for next iteration. */
4017 if (need_iterate_p)
4018 body_stmts += 3 * num_vectors;
4021 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4022 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4023 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4024 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4027 /* FORNOW: The scalar outside cost is incremented in one of the
4028 following ways:
4030 1. The vectorizer checks for alignment and aliasing and generates
4031 a condition that allows dynamic vectorization. A cost model
4032 check is ANDED with the versioning condition. Hence scalar code
4033 path now has the added cost of the versioning check.
4035 if (cost > th & versioning_check)
4036 jmp to vector code
4038 Hence the run-time scalar cost is incremented by the not-taken branch cost.
4040 2. The vectorizer then checks if a prologue is required. If the
4041 cost model check was not done before during versioning, it has to
4042 be done before the prologue check.
4044 if (cost <= th)
4045 prologue = scalar_iters
4046 if (prologue == 0)
4047 jmp to vector code
4048 else
4049 execute prologue
4050 if (prologue == num_iters)
4051 go to exit
4053 Hence the run-time scalar cost is incremented by a taken branch,
4054 plus a not-taken branch, plus a taken branch cost.
4056 3. The vectorizer then checks if an epilogue is required. If the
4057 cost model check was not done before during prologue check, it
4058 has to be done with the epilogue check.
4060 if (prologue == 0)
4061 jmp to vector code
4062 else
4063 execute prologue
4064 if (prologue == num_iters)
4065 go to exit
4066 vector code:
4067 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4068 jmp to epilogue
4070 Hence the run-time scalar cost should be incremented by 2 taken
4071 branches.
4073 TODO: The back end may reorder the BBs differently and reverse
4074 conditions/branch directions. Change the estimates below to
4075 something more reasonable. */
4077 /* If the number of iterations is known and we do not do versioning, we can
4078 decide whether to vectorize at compile time. Hence the scalar version
4079 does not carry cost model guard costs. */
4080 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4081 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4083 /* Cost model check occurs at versioning. */
4084 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4085 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4086 else
4088 /* Cost model check occurs at prologue generation. */
4089 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4090 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4091 + vect_get_stmt_cost (cond_branch_not_taken);
4092 /* Cost model check occurs at epilogue generation. */
4093 else
4094 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4098 /* Complete the target-specific cost calculations. */
4099 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4100 &vec_inside_cost, &vec_epilogue_cost);
4102 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4104 /* Stash the costs so that we can compare two loop_vec_infos. */
4105 loop_vinfo->vec_inside_cost = vec_inside_cost;
4106 loop_vinfo->vec_outside_cost = vec_outside_cost;
4108 if (dump_enabled_p ())
4110 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4111 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4112 vec_inside_cost);
4113 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4114 vec_prologue_cost);
4115 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4116 vec_epilogue_cost);
4117 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4118 scalar_single_iter_cost);
4119 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4120 scalar_outside_cost);
4121 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4122 vec_outside_cost);
4123 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4124 peel_iters_prologue);
4125 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4126 peel_iters_epilogue);
4129 /* Calculate number of iterations required to make the vector version
4130 profitable, relative to the loop bodies only. The following condition
4131 must hold true:
4132 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4133 where
4134 SIC = scalar iteration cost, VIC = vector iteration cost,
4135 VOC = vector outside cost, VF = vectorization factor,
4136 NPEEL = prologue iterations + epilogue iterations,
4137 SOC = scalar outside cost for run time cost model check. */
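  /* As an illustrative example (made-up numbers): with SIC = 4, VIC = 8,
     VF = 4, VOC = 20, SOC = 0 and NPEEL = 0 the scalar cost is 4 * niters
     while the vector cost is 8 * (niters / 4) + 20 = 2 * niters + 20, so
     vectorization pays off from 11 scalar iterations on; nonzero NPEEL and
     SOC shift this threshold accordingly.  */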
4139 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4140 - vec_inside_cost);
4141 if (saving_per_viter <= 0)
4143 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4144 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4145 "vectorization did not happen for a simd loop");
4147 if (dump_enabled_p ())
4148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4149 "cost model: the vector iteration cost = %d "
4150 "divided by the scalar iteration cost = %d "
4151 "is greater or equal to the vectorization factor = %d"
4152 ".\n",
4153 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4154 *ret_min_profitable_niters = -1;
4155 *ret_min_profitable_estimate = -1;
4156 return;
4159 /* ??? The "if" arm is written to handle all cases; see below for what
4160 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4161 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4163 /* Rewriting the condition above in terms of the number of
4164 vector iterations (vniters) rather than the number of
4165 scalar iterations (niters) gives:
4167 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4169 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4171 For integer N, X and Y when X > 0:
4173 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4174 int outside_overhead = (vec_outside_cost
4175 - scalar_single_iter_cost * peel_iters_prologue
4176 - scalar_single_iter_cost * peel_iters_epilogue
4177 - scalar_outside_cost);
4178 /* We're only interested in cases that require at least one
4179 vector iteration. */
4180 int min_vec_niters = 1;
4181 if (outside_overhead > 0)
4182 min_vec_niters = outside_overhead / saving_per_viter + 1;
4184 if (dump_enabled_p ())
4185 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4186 min_vec_niters);
4188 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4190 /* Now that we know the minimum number of vector iterations,
4191 find the minimum niters for which the scalar cost is larger:
4193 SIC * niters > VIC * vniters + VOC - SOC
4195 We know that the minimum niters is no more than
4196 vniters * VF + NPEEL, but it might be (and often is) less
4197 than that if a partial vector iteration is cheaper than the
4198 equivalent scalar code. */
4199 int threshold = (vec_inside_cost * min_vec_niters
4200 + vec_outside_cost
4201 - scalar_outside_cost);
4202 if (threshold <= 0)
4203 min_profitable_iters = 1;
4204 else
4205 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4207 else
4208 /* Convert the number of vector iterations into a number of
4209 scalar iterations. */
4210 min_profitable_iters = (min_vec_niters * assumed_vf
4211 + peel_iters_prologue
4212 + peel_iters_epilogue);
4214 else
4216 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4217 * assumed_vf
4218 - vec_inside_cost * peel_iters_prologue
4219 - vec_inside_cost * peel_iters_epilogue);
4220 if (min_profitable_iters <= 0)
4221 min_profitable_iters = 0;
4222 else
4224 min_profitable_iters /= saving_per_viter;
4226 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4227 <= (((int) vec_inside_cost * min_profitable_iters)
4228 + (((int) vec_outside_cost - scalar_outside_cost)
4229 * assumed_vf)))
4230 min_profitable_iters++;
4234 if (dump_enabled_p ())
4235 dump_printf (MSG_NOTE,
4236 " Calculated minimum iters for profitability: %d\n",
4237 min_profitable_iters);
4239 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4240 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4241 /* We want the vectorized loop to execute at least once. */
4242 min_profitable_iters = assumed_vf + peel_iters_prologue;
4243 else if (min_profitable_iters < peel_iters_prologue)
4244 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4245 vectorized loop executes at least once. */
4246 min_profitable_iters = peel_iters_prologue;
4248 if (dump_enabled_p ())
4249 dump_printf_loc (MSG_NOTE, vect_location,
4250 " Runtime profitability threshold = %d\n",
4251 min_profitable_iters);
4253 *ret_min_profitable_niters = min_profitable_iters;
4255 /* Calculate number of iterations required to make the vector version
4256 profitable, relative to the loop bodies only.
4258 The non-vectorized variant costs SIC * niters and must win over the vector
4259 variant on the expected loop trip count. The following condition must hold true:
4260 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4262 if (vec_outside_cost <= 0)
4263 min_profitable_estimate = 0;
4264 /* ??? This "else if" arm is written to handle all cases; see below for
4265 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4266 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4268 /* This is a repeat of the code above, but with + SOC rather
4269 than - SOC. */
4270 int outside_overhead = (vec_outside_cost
4271 - scalar_single_iter_cost * peel_iters_prologue
4272 - scalar_single_iter_cost * peel_iters_epilogue
4273 + scalar_outside_cost);
4274 int min_vec_niters = 1;
4275 if (outside_overhead > 0)
4276 min_vec_niters = outside_overhead / saving_per_viter + 1;
4278 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4280 int threshold = (vec_inside_cost * min_vec_niters
4281 + vec_outside_cost
4282 + scalar_outside_cost);
4283 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4285 else
4286 min_profitable_estimate = (min_vec_niters * assumed_vf
4287 + peel_iters_prologue
4288 + peel_iters_epilogue);
4290 else
4292 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4293 * assumed_vf
4294 - vec_inside_cost * peel_iters_prologue
4295 - vec_inside_cost * peel_iters_epilogue)
4296 / ((scalar_single_iter_cost * assumed_vf)
4297 - vec_inside_cost);
4299 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4300 if (dump_enabled_p ())
4301 dump_printf_loc (MSG_NOTE, vect_location,
4302 " Static estimate profitability threshold = %d\n",
4303 min_profitable_estimate);
4305 *ret_min_profitable_estimate = min_profitable_estimate;
4308 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4309 vector elements (not bits) for a vector with NELT elements. */
4310 static void
4311 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4312 vec_perm_builder *sel)
4314 /* The encoding is a single stepped pattern. Any wrap-around is handled
4315 by vec_perm_indices. */
4316 sel->new_vector (nelt, 1, 3);
4317 for (unsigned int i = 0; i < 3; i++)
4318 sel->quick_push (i + offset);
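/* For instance (illustrative), OFFSET = 2 and NELT = 8 encode the series
   { 2, 3, 4, 5, 6, 7, 8, 9 }; in the two-input permutation built by
   have_whole_vector_shift the indices 8 and 9 select from the second
   operand, i.e. they are the elements shifted in.  */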
4321 /* Checks whether the target supports whole-vector shifts for vectors of mode
4322 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4323 it supports vec_perm_const with masks for all necessary shift amounts. */
4324 static bool
4325 have_whole_vector_shift (machine_mode mode)
4327 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4328 return true;
4330 /* Variable-length vectors should be handled via the optab. */
4331 unsigned int nelt;
4332 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4333 return false;
4335 vec_perm_builder sel;
4336 vec_perm_indices indices;
4337 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4339 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4340 indices.new_vector (sel, 2, nelt);
4341 if (!can_vec_perm_const_p (mode, indices, false))
4342 return false;
4344 return true;
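/* The reduction epilogue relies on this, for example: with whole-vector
   shifts an 8-element vector can be reduced in log2(8) = 3 shift-and-combine
   steps (shift by 4 elements and combine, then by 2, then by 1) followed by
   a single extraction of element 0; this is the scheme behind the
   exact_log2-based cost in vect_model_reduction_cost (a sketch of the idea
   only).  */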
4347 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4348 functions. Design better to avoid maintenance issues. */
4350 /* Function vect_model_reduction_cost.
4352 Models cost for a reduction operation, including the vector ops
4353 generated within the strip-mine loop, the initial definition before
4354 the loop, and the epilogue code that must be generated. */
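/* For instance (a rough sketch for a plain sum reduction with REDUC_FN
   available and no nesting): the prologue is charged one scalar_to_vec for
   broadcasting the initial value, the loop body NCOPIES vector statements
   for the adds, and the epilogue one vector statement for the reduction
   plus one vec_to_scalar for extracting the final scalar.  */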
4356 static void
4357 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4358 stmt_vec_info stmt_info, internal_fn reduc_fn,
4359 vect_reduction_type reduction_type,
4360 int ncopies, stmt_vector_for_cost *cost_vec)
4362 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4363 enum tree_code code;
4364 optab optab;
4365 tree vectype;
4366 machine_mode mode;
4367 class loop *loop = NULL;
4369 if (loop_vinfo)
4370 loop = LOOP_VINFO_LOOP (loop_vinfo);
4372 /* Condition reductions generate two reductions in the loop. */
4373 if (reduction_type == COND_REDUCTION)
4374 ncopies *= 2;
4376 vectype = STMT_VINFO_VECTYPE (stmt_info);
4377 mode = TYPE_MODE (vectype);
4378 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4380 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4382 if (reduction_type == EXTRACT_LAST_REDUCTION)
4383 /* No extra instructions are needed in the prologue. The loop body
4384 operations are costed in vectorizable_condition. */
4385 inside_cost = 0;
4386 else if (reduction_type == FOLD_LEFT_REDUCTION)
4388 /* No extra instructions needed in the prologue. */
4389 prologue_cost = 0;
4391 if (reduc_fn != IFN_LAST)
4392 /* Count one reduction-like operation per vector. */
4393 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4394 stmt_info, 0, vect_body);
4395 else
4397 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4398 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4399 inside_cost = record_stmt_cost (cost_vec, nelements,
4400 vec_to_scalar, stmt_info, 0,
4401 vect_body);
4402 inside_cost += record_stmt_cost (cost_vec, nelements,
4403 scalar_stmt, stmt_info, 0,
4404 vect_body);
4407 else
4409 /* Add in cost for initial definition.
4410 For cond reduction we have four vectors: initial index, step,
4411 initial result of the data reduction, initial value of the index
4412 reduction. */
4413 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4414 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4415 scalar_to_vec, stmt_info, 0,
4416 vect_prologue);
4418 /* Cost of reduction op inside loop. */
4419 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4420 stmt_info, 0, vect_body);
4423 /* Determine cost of epilogue code.
4425 We have a reduction operator that will reduce the vector in one statement.
4426 Also requires scalar extract. */
4428 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4430 if (reduc_fn != IFN_LAST)
4432 if (reduction_type == COND_REDUCTION)
4434 /* An EQ stmt and a COND_EXPR stmt. */
4435 epilogue_cost += record_stmt_cost (cost_vec, 2,
4436 vector_stmt, stmt_info, 0,
4437 vect_epilogue);
4438 /* Reduction of the max index and a reduction of the found
4439 values. */
4440 epilogue_cost += record_stmt_cost (cost_vec, 2,
4441 vec_to_scalar, stmt_info, 0,
4442 vect_epilogue);
4443 /* A broadcast of the max value. */
4444 epilogue_cost += record_stmt_cost (cost_vec, 1,
4445 scalar_to_vec, stmt_info, 0,
4446 vect_epilogue);
4448 else
4450 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4451 stmt_info, 0, vect_epilogue);
4452 epilogue_cost += record_stmt_cost (cost_vec, 1,
4453 vec_to_scalar, stmt_info, 0,
4454 vect_epilogue);
4457 else if (reduction_type == COND_REDUCTION)
4459 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4460 /* Extraction of scalar elements. */
4461 epilogue_cost += record_stmt_cost (cost_vec,
4462 2 * estimated_nunits,
4463 vec_to_scalar, stmt_info, 0,
4464 vect_epilogue);
4465 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4466 epilogue_cost += record_stmt_cost (cost_vec,
4467 2 * estimated_nunits - 3,
4468 scalar_stmt, stmt_info, 0,
4469 vect_epilogue);
4471 else if (reduction_type == EXTRACT_LAST_REDUCTION
4472 || reduction_type == FOLD_LEFT_REDUCTION)
4473 /* No extra instructions needed in the epilogue. */
4475 else
4477 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4478 tree bitsize =
4479 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4480 int element_bitsize = tree_to_uhwi (bitsize);
4481 int nelements = vec_size_in_bits / element_bitsize;
4483 if (code == COND_EXPR)
4484 code = MAX_EXPR;
4486 optab = optab_for_tree_code (code, vectype, optab_default);
4488 /* We have a whole vector shift available. */
4489 if (optab != unknown_optab
4490 && VECTOR_MODE_P (mode)
4491 && optab_handler (optab, mode) != CODE_FOR_nothing
4492 && have_whole_vector_shift (mode))
4494 /* Final reduction via vector shifts and the reduction operator.
4495 Also requires scalar extract. */
4496 epilogue_cost += record_stmt_cost (cost_vec,
4497 exact_log2 (nelements) * 2,
4498 vector_stmt, stmt_info, 0,
4499 vect_epilogue);
4500 epilogue_cost += record_stmt_cost (cost_vec, 1,
4501 vec_to_scalar, stmt_info, 0,
4502 vect_epilogue);
4504 else
4505 /* Use extracts and reduction op for final reduction. For N
4506 elements, we have N extracts and N-1 reduction ops. */
4507 epilogue_cost += record_stmt_cost (cost_vec,
4508 nelements + nelements - 1,
4509 vector_stmt, stmt_info, 0,
4510 vect_epilogue);
4514 if (dump_enabled_p ())
4515 dump_printf (MSG_NOTE,
4516 "vect_model_reduction_cost: inside_cost = %d, "
4517 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4518 prologue_cost, epilogue_cost);
4523 /* Function get_initial_def_for_reduction
4525 Input:
4526 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4527 INIT_VAL - the initial value of the reduction variable
4529 Output:
4530 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4531 of the reduction (used for adjusting the epilog - see below).
4532 Return a vector variable, initialized according to the operation that
4533 STMT_VINFO performs. This vector will be used as the initial value
4534 of the vector of partial results.
4536 Option1 (adjust in epilog): Initialize the vector as follows:
4537 add/bit or/xor: [0,0,...,0,0]
4538 mult/bit and: [1,1,...,1,1]
4539 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4540 and when necessary (e.g. add/mult case) let the caller know
4541 that it needs to adjust the result by init_val.
4543 Option2: Initialize the vector as follows:
4544 add/bit or/xor: [init_val,0,0,...,0]
4545 mult/bit and: [init_val,1,1,...,1]
4546 min/max/cond_expr: [init_val,init_val,...,init_val]
4547 and no adjustments are needed.
4549 For example, for the following code:
4551 s = init_val;
4552 for (i=0;i<n;i++)
4553 s = s + a[i];
4555 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4556 For a vector of 4 units, we want to return either [0,0,0,init_val],
4557 or [0,0,0,0] and let the caller know that it needs to adjust
4558 the result at the end by 'init_val'.
4560 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4561 is not NULL, because this way the initialization vector is simpler (same
4562 element in all entries), and Option2 otherwise.
4564 A cost model should help decide between these two schemes. */
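/* A further illustration (made-up values): for  s = init;  ...  s *= a[i];
   with a 4-lane vector, Option1 yields {1,1,1,1} and reports INIT as the
   adjustment to be multiplied in after the loop, whereas Option2 yields
   {init,1,1,1} and needs no adjustment; for MIN/MAX/COND_EXPR the vector is
   {init,init,init,init} either way.  */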
4566 static tree
4567 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4568 stmt_vec_info stmt_vinfo,
4569 enum tree_code code, tree init_val,
4570 tree *adjustment_def)
4572 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4573 tree scalar_type = TREE_TYPE (init_val);
4574 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4575 tree def_for_init;
4576 tree init_def;
4577 REAL_VALUE_TYPE real_init_val = dconst0;
4578 int int_init_val = 0;
4579 gimple_seq stmts = NULL;
4581 gcc_assert (vectype);
4583 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4584 || SCALAR_FLOAT_TYPE_P (scalar_type));
4586 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4587 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4589 /* ADJUSTMENT_DEF is NULL when called from
4590 vect_create_epilog_for_reduction to vectorize double reduction. */
4591 if (adjustment_def)
4592 *adjustment_def = NULL;
4594 switch (code)
4596 case WIDEN_SUM_EXPR:
4597 case DOT_PROD_EXPR:
4598 case SAD_EXPR:
4599 case PLUS_EXPR:
4600 case MINUS_EXPR:
4601 case BIT_IOR_EXPR:
4602 case BIT_XOR_EXPR:
4603 case MULT_EXPR:
4604 case BIT_AND_EXPR:
4606 if (code == MULT_EXPR)
4608 real_init_val = dconst1;
4609 int_init_val = 1;
4612 if (code == BIT_AND_EXPR)
4613 int_init_val = -1;
4615 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4616 def_for_init = build_real (scalar_type, real_init_val);
4617 else
4618 def_for_init = build_int_cst (scalar_type, int_init_val);
4620 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4622 /* Option1: the first element is '0' or '1' as well. */
4623 if (!operand_equal_p (def_for_init, init_val, 0))
4624 *adjustment_def = init_val;
4625 init_def = gimple_build_vector_from_val (&stmts, vectype,
4626 def_for_init);
4628 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4630 /* Option2 (variable length): the first element is INIT_VAL. */
4631 init_def = gimple_build_vector_from_val (&stmts, vectype,
4632 def_for_init);
4633 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4634 vectype, init_def, init_val);
4636 else
4638 /* Option2: the first element is INIT_VAL. */
4639 tree_vector_builder elts (vectype, 1, 2);
4640 elts.quick_push (init_val);
4641 elts.quick_push (def_for_init);
4642 init_def = gimple_build_vector (&stmts, &elts);
4645 break;
4647 case MIN_EXPR:
4648 case MAX_EXPR:
4649 case COND_EXPR:
4651 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4652 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4654 break;
4656 default:
4657 gcc_unreachable ();
4660 if (stmts)
4661 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4662 return init_def;
4665 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4666 NUMBER_OF_VECTORS is the number of vector defs to create.
4667 If NEUTRAL_OP is nonnull, introducing extra elements of that
4668 value will not change the result. */
4670 static void
4671 get_initial_defs_for_reduction (vec_info *vinfo,
4672 slp_tree slp_node,
4673 vec<tree> *vec_oprnds,
4674 unsigned int number_of_vectors,
4675 bool reduc_chain, tree neutral_op)
4677 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4678 stmt_vec_info stmt_vinfo = stmts[0];
4679 unsigned HOST_WIDE_INT nunits;
4680 unsigned j, number_of_places_left_in_vector;
4681 tree vector_type;
4682 unsigned int group_size = stmts.length ();
4683 unsigned int i;
4684 class loop *loop;
4686 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4688 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4690 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4691 gcc_assert (loop);
4692 edge pe = loop_preheader_edge (loop);
4694 gcc_assert (!reduc_chain || neutral_op);
4696 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4697 created vectors. It is greater than 1 if unrolling is performed.
4699 For example, we have two scalar operands, s1 and s2 (e.g., group of
4700 strided accesses of size two), while NUNITS is four (i.e., four scalars
4701 of this type can be packed in a vector). The output vector will contain
4702 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4703 will be 2).
4705 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4706 vectors containing the operands.
4708 For example, NUNITS is four as before, and the group size is 8
4709 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4710 {s5, s6, s7, s8}. */
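      /* A further illustration: with NUNITS = 4, GROUP_SIZE = 3 and a neutral
	 value available, the first vector can be completed as
	 {s1, s2, s3, neutral}, because extra neutral elements do not change
	 the reduction result.  */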
4712 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4713 nunits = group_size;
4715 number_of_places_left_in_vector = nunits;
4716 bool constant_p = true;
4717 tree_vector_builder elts (vector_type, nunits, 1);
4718 elts.quick_grow (nunits);
4719 gimple_seq ctor_seq = NULL;
4720 for (j = 0; j < nunits * number_of_vectors; ++j)
4722 tree op;
4723 i = j % group_size;
4724 stmt_vinfo = stmts[i];
4726 /* Get the def before the loop. In a reduction chain we have only one
4727 initial value; otherwise we have as many as there are PHIs in the group. */
4728 if (reduc_chain)
4729 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4730 else if (((vec_oprnds->length () + 1) * nunits
4731 - number_of_places_left_in_vector >= group_size)
4732 && neutral_op)
4733 op = neutral_op;
4734 else
4735 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4737 /* Create 'vect_ = {op0,op1,...,opn}'. */
4738 number_of_places_left_in_vector--;
4739 elts[nunits - number_of_places_left_in_vector - 1] = op;
4740 if (!CONSTANT_CLASS_P (op))
4741 constant_p = false;
4743 if (number_of_places_left_in_vector == 0)
4745 tree init;
4746 if (constant_p && !neutral_op
4747 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4748 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4749 /* Build the vector directly from ELTS. */
4750 init = gimple_build_vector (&ctor_seq, &elts);
4751 else if (neutral_op)
4753 /* Build a vector of the neutral value and shift the
4754 other elements into place. */
4755 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4756 neutral_op);
4757 int k = nunits;
4758 while (k > 0 && elts[k - 1] == neutral_op)
4759 k -= 1;
4760 while (k > 0)
4762 k -= 1;
4763 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4764 vector_type, init, elts[k]);
4767 else
4769 /* First time round, duplicate ELTS to fill the
4770 required number of vectors. */
4771 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4772 number_of_vectors, *vec_oprnds);
4773 break;
4775 vec_oprnds->quick_push (init);
4777 number_of_places_left_in_vector = nunits;
4778 elts.new_vector (vector_type, nunits, 1);
4779 elts.quick_grow (nunits);
4780 constant_p = true;
4783 if (ctor_seq != NULL)
4784 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4787 /* For a statement STMT_INFO taking part in a reduction operation, return
4788 the stmt_vec_info that the meta information is stored on. */
4790 stmt_vec_info
4791 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4793 stmt_info = vect_orig_stmt (stmt_info);
4794 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4795 if (!is_a <gphi *> (stmt_info->stmt)
4796 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4797 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4798 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4799 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4801 if (gimple_phi_num_args (phi) == 1)
4802 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4804 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4806 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4807 stmt_vec_info info
4808 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4809 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4810 stmt_info = info;
4812 return stmt_info;
4815 /* Function vect_create_epilog_for_reduction
4817 Create code at the loop-epilog to finalize the result of a reduction
4818 computation.
4820 STMT_INFO is the scalar reduction stmt that is being vectorized.
4821 SLP_NODE is an SLP node containing a group of reduction statements. The
4822 first one in this group is STMT_INFO.
4823 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4824 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4825 (counting from 0)
4827 This function:
4828 1. Completes the reduction def-use cycles.
4829 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4830 by calling the function specified by REDUC_FN if available, or by
4831 other means (whole-vector shifts or a scalar loop).
4832 The function also creates a new phi node at the loop exit to preserve
4833 loop-closed form, as illustrated below.
4835 The flow at the entry to this function:
4837 loop:
4838 vec_def = phi <vec_init, null> # REDUCTION_PHI
4839 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4840 s_loop = scalar_stmt # (scalar) STMT_INFO
4841 loop_exit:
4842 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4843 use <s_out0>
4844 use <s_out0>
4846 The above is transformed by this function into:
4848 loop:
4849 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4850 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4851 s_loop = scalar_stmt # (scalar) STMT_INFO
4852 loop_exit:
4853 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4854 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4855 v_out2 = reduce <v_out1>
4856 s_out3 = extract_field <v_out2, 0>
4857 s_out4 = adjust_result <s_out3>
4858 use <s_out4>
4859 use <s_out4>
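     As an illustration (a sketch with made-up details): for a four-lane sum
     reduction, v_out2 above is a REDUC_PLUS-style reduction of v_out1,
     s_out3 extracts its element 0, and s_out4 adds the epilogue adjustment
     (e.g. the scalar initial value) when the 'adjust in epilog' scheme was
     chosen.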
4862 static void
4863 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4864 stmt_vec_info stmt_info,
4865 slp_tree slp_node,
4866 slp_instance slp_node_instance)
4868 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4869 gcc_assert (reduc_info->is_reduc_info);
4870 /* For double reductions we need to get at the inner loop reduction
4871 stmt which has the meta info attached. Our stmt_info is that of the
4872 loop-closed PHI of the inner loop which we remember as
4873 def for the reduction PHI generation. */
4874 bool double_reduc = false;
4875 stmt_vec_info rdef_info = stmt_info;
4876 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4878 gcc_assert (!slp_node);
4879 double_reduc = true;
4880 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4881 (stmt_info->stmt, 0));
4882 stmt_info = vect_stmt_to_vectorize (stmt_info);
4884 gphi *reduc_def_stmt
4885 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4886 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4887 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4888 tree vectype;
4889 machine_mode mode;
4890 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4891 basic_block exit_bb;
4892 tree scalar_dest;
4893 tree scalar_type;
4894 gimple *new_phi = NULL, *phi;
4895 gimple_stmt_iterator exit_gsi;
4896 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4897 gimple *epilog_stmt = NULL;
4898 gimple *exit_phi;
4899 tree bitsize;
4900 tree def;
4901 tree orig_name, scalar_result;
4902 imm_use_iterator imm_iter, phi_imm_iter;
4903 use_operand_p use_p, phi_use_p;
4904 gimple *use_stmt;
4905 bool nested_in_vect_loop = false;
4906 auto_vec<gimple *> new_phis;
4907 int j, i;
4908 auto_vec<tree> scalar_results;
4909 unsigned int group_size = 1, k;
4910 auto_vec<gimple *> phis;
4911 bool slp_reduc = false;
4912 bool direct_slp_reduc;
4913 tree new_phi_result;
4914 tree induction_index = NULL_TREE;
4916 if (slp_node)
4917 group_size = SLP_TREE_LANES (slp_node);
4919 if (nested_in_vect_loop_p (loop, stmt_info))
4921 outer_loop = loop;
4922 loop = loop->inner;
4923 nested_in_vect_loop = true;
4924 gcc_assert (!slp_node);
4926 gcc_assert (!nested_in_vect_loop || double_reduc);
4928 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4929 gcc_assert (vectype);
4930 mode = TYPE_MODE (vectype);
4932 tree initial_def = NULL;
4933 tree induc_val = NULL_TREE;
4934 tree adjustment_def = NULL;
4935 if (slp_node)
4937 else
4939 /* Get at the scalar def before the loop, that defines the initial value
4940 of the reduction variable. */
4941 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4942 loop_preheader_edge (loop));
4943 /* Optimize: for induction condition reduction, if we can't use zero
4944 for induc_val, use initial_def. */
4945 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4946 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4947 else if (double_reduc)
4949 else if (nested_in_vect_loop)
4951 else
4952 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4955 unsigned vec_num;
4956 int ncopies;
4957 if (slp_node)
4959 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4960 ncopies = 1;
4962 else
4964 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4965 vec_num = 1;
4966 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4969 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4970 which is updated with the current index of the loop for every match of
4971 the original loop's cond_expr (VEC_STMT). This results in a vector
4972 containing the last time the condition passed for that vector lane.
4973 The first match will be a 1 to allow 0 to be used for non-matching
4974 indexes. If there are no matches at all then the vector will be all
4975 zeroes.
4977 PR92772: This algorithm is broken for architectures that support
4978 masked vectors, but do not provide fold_extract_last. */
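  /* For illustration (made-up values): with four lanes the index IV is
     {1,2,3,4} in the first vector iteration and {5,6,7,8} in the second.
     Starting from {0,0,0,0}, if the condition matches only in the third lane
     of the first iteration and only in the second lane of the second, the
     phi value becomes {0,0,3,0} and then {0,6,3,0}; the epilogue later takes
     the maximum (6) to find the lane holding the last matching value.  */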
4979 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4981 auto_vec<std::pair<tree, bool>, 2> ccompares;
4982 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4983 cond_info = vect_stmt_to_vectorize (cond_info);
4984 while (cond_info != reduc_info)
4986 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4988 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4989 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4990 ccompares.safe_push
4991 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4992 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4994 cond_info
4995 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4996 1 + STMT_VINFO_REDUC_IDX
4997 (cond_info)));
4998 cond_info = vect_stmt_to_vectorize (cond_info);
5000 gcc_assert (ccompares.length () != 0);
5002 tree indx_before_incr, indx_after_incr;
5003 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5004 int scalar_precision
5005 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5006 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5007 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5008 (TYPE_MODE (vectype), cr_index_scalar_type,
5009 TYPE_VECTOR_SUBPARTS (vectype));
5011 /* First we create a simple vector induction variable which starts
5012 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5013 vector size (STEP). */
5015 /* Create a {1,2,3,...} vector. */
5016 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5018 /* Create a vector of the step value. */
5019 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5020 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5022 /* Create an induction variable. */
5023 gimple_stmt_iterator incr_gsi;
5024 bool insert_after;
5025 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5026 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5027 insert_after, &indx_before_incr, &indx_after_incr);
5029 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5030 filled with zeros (VEC_ZERO). */
5032 /* Create a vector of 0s. */
5033 tree zero = build_zero_cst (cr_index_scalar_type);
5034 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5036 /* Create a vector phi node. */
5037 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5038 new_phi = create_phi_node (new_phi_tree, loop->header);
5039 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5040 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5042 /* Now take the condition from the loop's original cond_exprs
5043 and produce a new cond_expr (INDEX_COND_EXPR) which for
5044 every match uses values from the induction variable
5045 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5046 (NEW_PHI_TREE).
5047 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5048 the new cond_expr (INDEX_COND_EXPR). */
5049 gimple_seq stmts = NULL;
5050 for (int i = ccompares.length () - 1; i != -1; --i)
5052 tree ccompare = ccompares[i].first;
5053 if (ccompares[i].second)
5054 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5055 cr_index_vector_type,
5056 ccompare,
5057 indx_before_incr, new_phi_tree);
5058 else
5059 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5060 cr_index_vector_type,
5061 ccompare,
5062 new_phi_tree, indx_before_incr);
5064 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5066 /* Update the phi with the vec cond. */
5067 induction_index = new_phi_tree;
5068 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5069 loop_latch_edge (loop), UNKNOWN_LOCATION);
5072 /* 2. Create epilog code.
5073 The reduction epilog code operates across the elements of the vector
5074 of partial results computed by the vectorized loop.
5075 The reduction epilog code consists of:
5077 step 1: compute the scalar result in a vector (v_out2)
5078 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5079 step 3: adjust the scalar result (s_out3) if needed.
5081 Step 1 can be accomplished using one of the following three schemes:
5082 (scheme 1) using reduc_fn, if available.
5083 (scheme 2) using whole-vector shifts, if available.
5084 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5085 combined.
5087 The overall epilog code looks like this:
5089 s_out0 = phi <s_loop> # original EXIT_PHI
5090 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5091 v_out2 = reduce <v_out1> # step 1
5092 s_out3 = extract_field <v_out2, 0> # step 2
5093 s_out4 = adjust_result <s_out3> # step 3
5095 (step 3 is optional, and steps 1 and 2 may be combined).
5096 Lastly, the uses of s_out0 are replaced by s_out4. */
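/* As a rough example, for a 4-lane sum reduction the loop leaves
   v_out1 = {s0,s1,s2,s3} holding four partial sums; step 1 computes
   s0+s1+s2+s3 (directly via reduc_fn or with shifts), step 2 extracts
   that scalar and step 3, if needed, adds back the initial value of
   the reduction variable.  */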
5099 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5100 v_out1 = phi <VECT_DEF>
5101 Store them in NEW_PHIS. */
5102 if (double_reduc)
5103 loop = outer_loop;
5104 exit_bb = single_exit (loop)->dest;
5105 new_phis.create (slp_node ? vec_num : ncopies);
5106 for (unsigned i = 0; i < vec_num; i++)
5108 if (slp_node)
5109 def = vect_get_slp_vect_def (slp_node, i);
5110 else
5111 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5112 for (j = 0; j < ncopies; j++)
5114 tree new_def = copy_ssa_name (def);
5115 phi = create_phi_node (new_def, exit_bb);
5116 if (j == 0)
5117 new_phis.quick_push (phi);
5118 else
5120 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5121 new_phis.quick_push (phi);
5124 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5128 exit_gsi = gsi_after_labels (exit_bb);
5130 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5131 (i.e. when reduc_fn is not available) and in the final adjustment
5132 code (if needed). Also get the original scalar reduction variable as
5133 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5134 represents a reduction pattern), the tree-code and scalar-def are
5135 taken from the original stmt that the pattern-stmt (STMT) replaces.
5136 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5137 are taken from STMT. */
5139 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5140 if (orig_stmt_info != stmt_info)
5142 /* Reduction pattern */
5143 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5144 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5147 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5148 scalar_type = TREE_TYPE (scalar_dest);
5149 scalar_results.create (group_size);
5150 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5151 bitsize = TYPE_SIZE (scalar_type);
5153 /* SLP reduction without reduction chain, e.g.,
5154 # a1 = phi <a2, a0>
5155 # b1 = phi <b2, b0>
5156 a2 = operation (a1)
5157 b2 = operation (b1) */
5158 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5160 /* True if we should implement SLP_REDUC using native reduction operations
5161 instead of scalar operations. */
5162 direct_slp_reduc = (reduc_fn != IFN_LAST
5163 && slp_reduc
5164 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5166 /* In case of reduction chain, e.g.,
5167 # a1 = phi <a3, a0>
5168 a2 = operation (a1)
5169 a3 = operation (a2),
5171 we may end up with more than one vector result. Here we reduce them to
5172 one vector. */
5173 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5175 gimple_seq stmts = NULL;
5176 tree first_vect = PHI_RESULT (new_phis[0]);
5177 first_vect = gimple_convert (&stmts, vectype, first_vect);
5178 for (k = 1; k < new_phis.length (); k++)
5180 gimple *next_phi = new_phis[k];
5181 tree second_vect = PHI_RESULT (next_phi);
5182 second_vect = gimple_convert (&stmts, vectype, second_vect);
5183 first_vect = gimple_build (&stmts, code, vectype,
5184 first_vect, second_vect);
5186 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5188 new_phi_result = first_vect;
5189 new_phis.truncate (0);
5190 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5192 /* Likewise if we couldn't use a single defuse cycle. */
5193 else if (ncopies > 1)
5195 gimple_seq stmts = NULL;
5196 tree first_vect = PHI_RESULT (new_phis[0]);
5197 first_vect = gimple_convert (&stmts, vectype, first_vect);
5198 for (int k = 1; k < ncopies; ++k)
5200 tree second_vect = PHI_RESULT (new_phis[k]);
5201 second_vect = gimple_convert (&stmts, vectype, second_vect);
5202 first_vect = gimple_build (&stmts, code, vectype,
5203 first_vect, second_vect);
5205 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5206 new_phi_result = first_vect;
5207 new_phis.truncate (0);
5208 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5210 else
5211 new_phi_result = PHI_RESULT (new_phis[0]);
5213 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5214 && reduc_fn != IFN_LAST)
5216 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5217 various data values where the condition matched and another vector
5218 (INDUCTION_INDEX) containing all the indexes of those matches. We
5219 need to extract the last matching index (which will be the index with
5220 highest value) and use this to index into the data vector.
5221 For the case where there were no matches, the data vector will contain
5222 all default values and the index vector will be all zeros. */
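/* For example, assuming 4 lanes, a data vector {d0,d1,d2,d3} and an
   index vector {0,3,0,7}: the last match happened in lane 3, so
   REDUC_MAX yields 7, the comparison below selects {0,0,0,d3} and the
   final max-reduction of that vector (after casting to unsigned)
   produces d3.  With no matches the index vector is {0,0,0,0}, every
   lane compares equal and the identical default values reduce to the
   default.  */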
5224 /* Get various versions of the type of the vector of indexes. */
5225 tree index_vec_type = TREE_TYPE (induction_index);
5226 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5227 tree index_scalar_type = TREE_TYPE (index_vec_type);
5228 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5230 /* Get an unsigned integer version of the type of the data vector. */
5231 int scalar_precision
5232 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5233 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5234 tree vectype_unsigned = build_vector_type
5235 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5237 /* First we need to create a vector (ZERO_VEC) of zeros and another
5238 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5239 can create using a MAX reduction and then expanding.
5240 In the case where the loop never made any matches, the max index will
5241 be zero. */
5243 /* Vector of {0, 0, 0,...}. */
5244 tree zero_vec = build_zero_cst (vectype);
5246 gimple_seq stmts = NULL;
5247 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5248 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5250 /* Find maximum value from the vector of found indexes. */
5251 tree max_index = make_ssa_name (index_scalar_type);
5252 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5253 1, induction_index);
5254 gimple_call_set_lhs (max_index_stmt, max_index);
5255 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5257 /* Vector of {max_index, max_index, max_index,...}. */
5258 tree max_index_vec = make_ssa_name (index_vec_type);
5259 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5260 max_index);
5261 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5262 max_index_vec_rhs);
5263 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5265 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5266 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5267 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5268 otherwise. Only one value should match, resulting in a vector
5269 (VEC_COND) with one data value and the rest zeros.
5270 In the case where the loop never made any matches, every index will
5271 match, resulting in a vector with all data values (which will all be
5272 the default value). */
5274 /* Compare the max index vector to the vector of found indexes to find
5275 the position of the max value. */
5276 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5277 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5278 induction_index,
5279 max_index_vec);
5280 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5282 /* Use the compare to choose either values from the data vector or
5283 zero. */
5284 tree vec_cond = make_ssa_name (vectype);
5285 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5286 vec_compare, new_phi_result,
5287 zero_vec);
5288 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5290 /* Finally we need to extract the data value from the vector (VEC_COND)
5291 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5292 reduction, but because this doesn't exist, we can use a MAX reduction
5293 instead. The data value might be signed or a float so we need to cast
5294 it first.
5295 In the case where the loop never made any matches, the data values are
5296 all identical, and so will reduce down correctly. */
5298 /* Make the matched data values unsigned. */
5299 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5300 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5301 vec_cond);
5302 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5303 VIEW_CONVERT_EXPR,
5304 vec_cond_cast_rhs);
5305 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5307 /* Reduce down to a scalar value. */
5308 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5309 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5310 1, vec_cond_cast);
5311 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5312 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5314 /* Convert the reduced value back to the result type and set as the
5315 result. */
5316 stmts = NULL;
5317 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5318 data_reduc);
5319 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5320 scalar_results.safe_push (new_temp);
5322 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5323 && reduc_fn == IFN_LAST)
5325 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5326 idx = 0;
5327 idx_val = induction_index[0];
5328 val = data_reduc[0];
5329 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5330 if (induction_index[i] > idx_val)
5331 val = data_reduc[i], idx_val = induction_index[i];
5332 return val; */
5334 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5335 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5336 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5337 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5338 /* Enforced by vectorizable_reduction, which ensures we have target
5339 support before allowing a conditional reduction on variable-length
5340 vectors. */
5341 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5342 tree idx_val = NULL_TREE, val = NULL_TREE;
5343 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5345 tree old_idx_val = idx_val;
5346 tree old_val = val;
5347 idx_val = make_ssa_name (idx_eltype);
5348 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5349 build3 (BIT_FIELD_REF, idx_eltype,
5350 induction_index,
5351 bitsize_int (el_size),
5352 bitsize_int (off)));
5353 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5354 val = make_ssa_name (data_eltype);
5355 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5356 build3 (BIT_FIELD_REF,
5357 data_eltype,
5358 new_phi_result,
5359 bitsize_int (el_size),
5360 bitsize_int (off)));
5361 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5362 if (off != 0)
5364 tree new_idx_val = idx_val;
5365 if (off != v_size - el_size)
5367 new_idx_val = make_ssa_name (idx_eltype);
5368 epilog_stmt = gimple_build_assign (new_idx_val,
5369 MAX_EXPR, idx_val,
5370 old_idx_val);
5371 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5373 tree new_val = make_ssa_name (data_eltype);
5374 epilog_stmt = gimple_build_assign (new_val,
5375 COND_EXPR,
5376 build2 (GT_EXPR,
5377 boolean_type_node,
5378 idx_val,
5379 old_idx_val),
5380 val, old_val);
5381 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5382 idx_val = new_idx_val;
5383 val = new_val;
5386 /* Convert the reduced value back to the result type and set as the
5387 result. */
5388 gimple_seq stmts = NULL;
5389 val = gimple_convert (&stmts, scalar_type, val);
5390 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5391 scalar_results.safe_push (val);
5394 /* 2.3 Create the reduction code, using one of the three schemes described
5395 above. In SLP we simply need to extract all the elements from the
5396 vector (without reducing them), so we use scalar shifts. */
5397 else if (reduc_fn != IFN_LAST && !slp_reduc)
5399 tree tmp;
5400 tree vec_elem_type;
5402 /* Case 1: Create:
5403 v_out2 = reduc_expr <v_out1> */
5405 if (dump_enabled_p ())
5406 dump_printf_loc (MSG_NOTE, vect_location,
5407 "Reduce using direct vector reduction.\n");
5409 gimple_seq stmts = NULL;
5410 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5411 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5412 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5413 vec_elem_type, new_phi_result);
5414 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5415 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5417 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5418 && induc_val)
5420 /* Earlier we set the initial value to be a vector of induc_val
5421 values. Check the result and if it is induc_val then replace
5422 with the original initial value, unless induc_val is
5423 the same as initial_def already. */
5424 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5425 induc_val);
5427 tmp = make_ssa_name (new_scalar_dest);
5428 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5429 initial_def, new_temp);
5430 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5431 new_temp = tmp;
5434 scalar_results.safe_push (new_temp);
5436 else if (direct_slp_reduc)
5438 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5439 with the elements for other SLP statements replaced with the
5440 neutral value. We can then do a normal reduction on each vector. */
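/* Illustrative example (with a fixed length for clarity): for
   group_size == 2 and a 4-lane accumulator the lanes interleave the
   two SLP results as {a0,b0,a1,b1}.  For the first result we build
   {a0,neutral,a1,neutral} and reduce it, for the second we build
   {neutral,b0,neutral,b1} and reduce that, giving the two scalar
   results.  */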
5442 /* Enforced by vectorizable_reduction. */
5443 gcc_assert (new_phis.length () == 1);
5444 gcc_assert (pow2p_hwi (group_size));
5446 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5447 vec<stmt_vec_info> orig_phis
5448 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5449 gimple_seq seq = NULL;
5451 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5452 and the same element size as VECTYPE. */
5453 tree index = build_index_vector (vectype, 0, 1);
5454 tree index_type = TREE_TYPE (index);
5455 tree index_elt_type = TREE_TYPE (index_type);
5456 tree mask_type = truth_type_for (index_type);
5458 /* Create a vector that, for each element, identifies which of
5459 the REDUC_GROUP_SIZE results should use it. */
5460 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5461 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5462 build_vector_from_val (index_type, index_mask));
5464 /* Get a neutral vector value. This is simply a splat of the neutral
5465 scalar value if we have one, otherwise the initial scalar value
5466 is itself a neutral value. */
5467 tree vector_identity = NULL_TREE;
5468 tree neutral_op = NULL_TREE;
5469 if (slp_node)
5471 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5472 neutral_op
5473 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5474 vectype, code, first != NULL);
5476 if (neutral_op)
5477 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5478 neutral_op);
5479 for (unsigned int i = 0; i < group_size; ++i)
5481 /* If there's no universal neutral value, we can use the
5482 initial scalar value from the original PHI. This is used
5483 for MIN and MAX reduction, for example. */
5484 if (!neutral_op)
5486 tree scalar_value
5487 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5488 loop_preheader_edge (loop));
5489 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5490 scalar_value);
5491 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5492 scalar_value);
5495 /* Calculate the equivalent of:
5497 sel[j] = (index[j] == i);
5499 which selects the elements of NEW_PHI_RESULT that should
5500 be included in the result. */
5501 tree compare_val = build_int_cst (index_elt_type, i);
5502 compare_val = build_vector_from_val (index_type, compare_val);
5503 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5504 index, compare_val);
5506 /* Calculate the equivalent of:
5508 vec = sel ? new_phi_result : vector_identity;
5510 VEC is now suitable for a full vector reduction. */
5511 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5512 sel, new_phi_result, vector_identity);
5514 /* Do the reduction and convert it to the appropriate type. */
5515 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5516 TREE_TYPE (vectype), vec);
5517 scalar = gimple_convert (&seq, scalar_type, scalar);
5518 scalar_results.safe_push (scalar);
5520 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5522 else
5524 bool reduce_with_shift;
5525 tree vec_temp;
5527 gcc_assert (slp_reduc || new_phis.length () == 1);
5529 /* See if the target wants to do the final (shift) reduction
5530 in a vector mode of smaller size and first reduce upper/lower
5531 halves against each other. */
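/* For instance, a V8HI accumulator may first be split into its two
   V4HI halves which are combined with CODE (added, for a sum
   reduction) so that the shift-based reduction below only has to
   operate on the narrower mode.  */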
5532 enum machine_mode mode1 = mode;
5533 tree stype = TREE_TYPE (vectype);
5534 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5535 unsigned nunits1 = nunits;
5536 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5537 && new_phis.length () == 1)
5539 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5540 /* For SLP reductions we have to make sure lanes match up, but
5541 since we're doing an individual-element final reduction, reducing
5542 the vector width here is even more important.
5543 ??? We can also separate lanes with permutes, for the common
5544 case of power-of-two group-size odd/even extracts would work. */
5545 if (slp_reduc && nunits != nunits1)
5547 nunits1 = least_common_multiple (nunits1, group_size);
5548 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5551 if (!slp_reduc
5552 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5553 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5555 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5556 stype, nunits1);
5557 reduce_with_shift = have_whole_vector_shift (mode1);
5558 if (!VECTOR_MODE_P (mode1))
5559 reduce_with_shift = false;
5560 else
5562 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5563 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5564 reduce_with_shift = false;
5567 /* First reduce the vector to the desired vector size on which we
5568 should do the shift reduction, by combining upper and lower halves. */
5569 new_temp = new_phi_result;
5570 while (nunits > nunits1)
5572 nunits /= 2;
5573 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5574 stype, nunits);
5575 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5577 /* The target has to make sure we support lowpart/highpart
5578 extraction, either via direct vector extract or through
5579 an integer mode punning. */
5580 tree dst1, dst2;
5581 if (convert_optab_handler (vec_extract_optab,
5582 TYPE_MODE (TREE_TYPE (new_temp)),
5583 TYPE_MODE (vectype1))
5584 != CODE_FOR_nothing)
5586 /* Extract sub-vectors directly once vec_extract becomes
5587 a conversion optab. */
5588 dst1 = make_ssa_name (vectype1);
5589 epilog_stmt
5590 = gimple_build_assign (dst1, BIT_FIELD_REF,
5591 build3 (BIT_FIELD_REF, vectype1,
5592 new_temp, TYPE_SIZE (vectype1),
5593 bitsize_int (0)));
5594 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5595 dst2 = make_ssa_name (vectype1);
5596 epilog_stmt
5597 = gimple_build_assign (dst2, BIT_FIELD_REF,
5598 build3 (BIT_FIELD_REF, vectype1,
5599 new_temp, TYPE_SIZE (vectype1),
5600 bitsize_int (bitsize)));
5601 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5603 else
5605 /* Extract via punning to appropriately sized integer mode
5606 vector. */
5607 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5608 tree etype = build_vector_type (eltype, 2);
5609 gcc_assert (convert_optab_handler (vec_extract_optab,
5610 TYPE_MODE (etype),
5611 TYPE_MODE (eltype))
5612 != CODE_FOR_nothing);
5613 tree tem = make_ssa_name (etype);
5614 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5615 build1 (VIEW_CONVERT_EXPR,
5616 etype, new_temp));
5617 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5618 new_temp = tem;
5619 tem = make_ssa_name (eltype);
5620 epilog_stmt
5621 = gimple_build_assign (tem, BIT_FIELD_REF,
5622 build3 (BIT_FIELD_REF, eltype,
5623 new_temp, TYPE_SIZE (eltype),
5624 bitsize_int (0)));
5625 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5626 dst1 = make_ssa_name (vectype1);
5627 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5628 build1 (VIEW_CONVERT_EXPR,
5629 vectype1, tem));
5630 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5631 tem = make_ssa_name (eltype);
5632 epilog_stmt
5633 = gimple_build_assign (tem, BIT_FIELD_REF,
5634 build3 (BIT_FIELD_REF, eltype,
5635 new_temp, TYPE_SIZE (eltype),
5636 bitsize_int (bitsize)));
5637 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5638 dst2 = make_ssa_name (vectype1);
5639 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5640 build1 (VIEW_CONVERT_EXPR,
5641 vectype1, tem));
5642 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5645 new_temp = make_ssa_name (vectype1);
5646 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5647 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5648 new_phis[0] = epilog_stmt;
5651 if (reduce_with_shift && !slp_reduc)
5653 int element_bitsize = tree_to_uhwi (bitsize);
5654 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5655 for variable-length vectors and also requires direct target support
5656 for loop reductions. */
5657 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5658 int nelements = vec_size_in_bits / element_bitsize;
5659 vec_perm_builder sel;
5660 vec_perm_indices indices;
5662 int elt_offset;
5664 tree zero_vec = build_zero_cst (vectype1);
5665 /* Case 2: Create:
5666 for (offset = nelements/2; offset >= 1; offset/=2)
5668 Create: va' = vec_shift <va, offset>
5669 Create: va = vop <va, va'>
5670 } */
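/* Illustration for a 4-element sum: starting from {s0,s1,s2,s3},
   shifting by 2 and adding gives {s0+s2, s1+s3, ...}, shifting by 1
   and adding gives {s0+s1+s2+s3, ...}; the remaining lanes hold
   values that are ignored and the scalar result is extracted from
   element 0 below.  */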
5672 tree rhs;
5674 if (dump_enabled_p ())
5675 dump_printf_loc (MSG_NOTE, vect_location,
5676 "Reduce using vector shifts\n");
5678 gimple_seq stmts = NULL;
5679 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5680 for (elt_offset = nelements / 2;
5681 elt_offset >= 1;
5682 elt_offset /= 2)
5684 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5685 indices.new_vector (sel, 2, nelements);
5686 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5687 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5688 new_temp, zero_vec, mask);
5689 new_temp = gimple_build (&stmts, code,
5690 vectype1, new_name, new_temp);
5692 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5694 /* 2.4 Extract the final scalar result. Create:
5695 s_out3 = extract_field <v_out2, bitpos> */
5697 if (dump_enabled_p ())
5698 dump_printf_loc (MSG_NOTE, vect_location,
5699 "extract scalar result\n");
5701 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5702 bitsize, bitsize_zero_node);
5703 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5704 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5705 gimple_assign_set_lhs (epilog_stmt, new_temp);
5706 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5707 scalar_results.safe_push (new_temp);
5709 else
5711 /* Case 3: Create:
5712 s = extract_field <v_out2, 0>
5713 for (offset = element_size;
5714 offset < vector_size;
5715 offset += element_size;)
5717 Create: s' = extract_field <v_out2, offset>
5718 Create: s = op <s, s'> // For non SLP cases
5719 } */
5721 if (dump_enabled_p ())
5722 dump_printf_loc (MSG_NOTE, vect_location,
5723 "Reduce using scalar code.\n");
5725 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5726 int element_bitsize = tree_to_uhwi (bitsize);
5727 tree compute_type = TREE_TYPE (vectype);
5728 gimple_seq stmts = NULL;
5729 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5731 int bit_offset;
5732 if (gimple_code (new_phi) == GIMPLE_PHI)
5733 vec_temp = PHI_RESULT (new_phi);
5734 else
5735 vec_temp = gimple_assign_lhs (new_phi);
5736 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5737 vec_temp, bitsize, bitsize_zero_node);
5739 /* In SLP we don't need to apply the reduction operation, so we just
5740 collect s' values in SCALAR_RESULTS. */
5741 if (slp_reduc)
5742 scalar_results.safe_push (new_temp);
5744 for (bit_offset = element_bitsize;
5745 bit_offset < vec_size_in_bits;
5746 bit_offset += element_bitsize)
5748 tree bitpos = bitsize_int (bit_offset);
5749 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5750 compute_type, vec_temp,
5751 bitsize, bitpos);
5752 if (slp_reduc)
5754 /* In SLP we don't need to apply the reduction operation, so
5755 we just collect s' values in SCALAR_RESULTS. */
5756 new_temp = new_name;
5757 scalar_results.safe_push (new_name);
5759 else
5760 new_temp = gimple_build (&stmts, code, compute_type,
5761 new_name, new_temp);
5765 /* The only case where we need to reduce scalar results in SLP is
5766 unrolling. If the size of SCALAR_RESULTS is greater than
5767 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5768 REDUC_GROUP_SIZE. */
5769 if (slp_reduc)
5771 tree res, first_res, new_res;
5773 /* Reduce multiple scalar results in case of SLP unrolling. */
5774 for (j = group_size; scalar_results.iterate (j, &res);
5775 j++)
5777 first_res = scalar_results[j % group_size];
5778 new_res = gimple_build (&stmts, code, compute_type,
5779 first_res, res);
5780 scalar_results[j % group_size] = new_res;
5782 for (k = 0; k < group_size; k++)
5783 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5784 scalar_results[k]);
5786 else
5788 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5789 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5790 scalar_results.safe_push (new_temp);
5793 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5796 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5797 && induc_val)
5799 /* Earlier we set the initial value to be a vector of induc_val
5800 values. Check the result and if it is induc_val then replace
5801 with the original initial value, unless induc_val is
5802 the same as initial_def already. */
5803 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5804 induc_val);
5806 tree tmp = make_ssa_name (new_scalar_dest);
5807 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5808 initial_def, new_temp);
5809 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5810 scalar_results[0] = tmp;
5814 /* 2.5 Adjust the final result by the initial value of the reduction
5815 variable. (When such adjustment is not needed, then
5816 'adjustment_def' is zero). For example, if code is PLUS we create:
5817 new_temp = loop_exit_def + adjustment_def */
5819 if (adjustment_def)
5821 gcc_assert (!slp_reduc);
5822 gimple_seq stmts = NULL;
5823 if (nested_in_vect_loop)
5825 new_phi = new_phis[0];
5826 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5827 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5828 new_temp = gimple_build (&stmts, code, vectype,
5829 PHI_RESULT (new_phi), adjustment_def);
5831 else
5833 new_temp = scalar_results[0];
5834 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5835 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5836 new_temp = gimple_build (&stmts, code, scalar_type,
5837 new_temp, adjustment_def);
5840 epilog_stmt = gimple_seq_last_stmt (stmts);
5841 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5842 if (nested_in_vect_loop)
5844 if (!double_reduc)
5845 scalar_results.quick_push (new_temp);
5846 else
5847 scalar_results[0] = new_temp;
5849 else
5850 scalar_results[0] = new_temp;
5852 new_phis[0] = epilog_stmt;
5855 if (double_reduc)
5856 loop = loop->inner;
5858 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5859 phis with new adjusted scalar results, i.e., replace use <s_out0>
5860 with use <s_out4>.
5862 Transform:
5863 loop_exit:
5864 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5865 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5866 v_out2 = reduce <v_out1>
5867 s_out3 = extract_field <v_out2, 0>
5868 s_out4 = adjust_result <s_out3>
5869 use <s_out0>
5870 use <s_out0>
5872 into:
5874 loop_exit:
5875 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5876 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5877 v_out2 = reduce <v_out1>
5878 s_out3 = extract_field <v_out2, 0>
5879 s_out4 = adjust_result <s_out3>
5880 use <s_out4>
5881 use <s_out4> */
5884 /* In an SLP reduction chain we reduce vector results into one vector if
5885 necessary; hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5886 LHS of the last stmt in the reduction chain, since we are looking for
5887 the loop exit phi node. */
5888 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5890 stmt_vec_info dest_stmt_info
5891 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5892 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5893 group_size = 1;
5896 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5897 case that REDUC_GROUP_SIZE is greater than the vectorization factor).
5898 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5899 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5900 correspond to the first vector stmt, etc.
5901 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5902 if (group_size > new_phis.length ())
5903 gcc_assert (!(group_size % new_phis.length ()));
5905 for (k = 0; k < group_size; k++)
5907 if (slp_reduc)
5909 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5911 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5912 /* SLP statements can't participate in patterns. */
5913 gcc_assert (!orig_stmt_info);
5914 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5917 if (nested_in_vect_loop)
5919 if (double_reduc)
5920 loop = outer_loop;
5921 else
5922 gcc_unreachable ();
5925 phis.create (3);
5926 /* Find the loop-closed-use at the loop exit of the original scalar
5927 result. (The reduction result is expected to have two immediate uses,
5928 one at the latch block, and one at the loop exit). For double
5929 reductions we are looking for exit phis of the outer loop. */
5930 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5932 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5934 if (!is_gimple_debug (USE_STMT (use_p)))
5935 phis.safe_push (USE_STMT (use_p));
5937 else
5939 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5941 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5943 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5945 if (!flow_bb_inside_loop_p (loop,
5946 gimple_bb (USE_STMT (phi_use_p)))
5947 && !is_gimple_debug (USE_STMT (phi_use_p)))
5948 phis.safe_push (USE_STMT (phi_use_p));
5954 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5956 /* Replace the uses: */
5957 orig_name = PHI_RESULT (exit_phi);
5958 scalar_result = scalar_results[k];
5959 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5961 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5962 SET_USE (use_p, scalar_result);
5963 update_stmt (use_stmt);
5967 phis.release ();
5971 /* Return a vector of type VECTYPE that is equal to the vector select
5972 operation "MASK ? VEC : IDENTITY". Insert the select statements
5973 before GSI. */
5975 static tree
5976 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5977 tree vec, tree identity)
5979 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5980 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5981 mask, vec, identity);
5982 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5983 return cond;
5986 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5987 order, starting with LHS. Insert the extraction statements before GSI and
5988 associate the new scalar SSA names with variable SCALAR_DEST.
5989 Return the SSA name for the result. */
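/* For example, for a 4-element VECTOR_RHS this emits roughly
     s = LHS CODE rhs[0];
     s = s CODE rhs[1];
     s = s CODE rhs[2];
     s = s CODE rhs[3];
   and returns the final s.  */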
5991 static tree
5992 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5993 tree_code code, tree lhs, tree vector_rhs)
5995 tree vectype = TREE_TYPE (vector_rhs);
5996 tree scalar_type = TREE_TYPE (vectype);
5997 tree bitsize = TYPE_SIZE (scalar_type);
5998 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5999 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6001 for (unsigned HOST_WIDE_INT bit_offset = 0;
6002 bit_offset < vec_size_in_bits;
6003 bit_offset += element_bitsize)
6005 tree bitpos = bitsize_int (bit_offset);
6006 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6007 bitsize, bitpos);
6009 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6010 rhs = make_ssa_name (scalar_dest, stmt);
6011 gimple_assign_set_lhs (stmt, rhs);
6012 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6014 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6015 tree new_name = make_ssa_name (scalar_dest, stmt);
6016 gimple_assign_set_lhs (stmt, new_name);
6017 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6018 lhs = new_name;
6020 return lhs;
6023 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6024 type of the vector input. */
6026 static internal_fn
6027 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6029 internal_fn mask_reduc_fn;
6031 switch (reduc_fn)
6033 case IFN_FOLD_LEFT_PLUS:
6034 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6035 break;
6037 default:
6038 return IFN_LAST;
6041 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6042 OPTIMIZE_FOR_SPEED))
6043 return mask_reduc_fn;
6044 return IFN_LAST;
6047 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6048 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6049 statement. CODE is the operation performed by STMT_INFO and OPS are
6050 its scalar operands. REDUC_INDEX is the index of the operand in
6051 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6052 implements in-order reduction, or IFN_LAST if we should open-code it.
6053 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6054 that should be used to control the operation in a fully-masked loop. */
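/* Conceptually, an in-order (fold-left) reduction of one vector
   {v0,v1,v2,v3} into the accumulator ACC performs
     acc = acc OP v0; acc = acc OP v1; acc = acc OP v2; acc = acc OP v3;
   preserving the scalar evaluation order (important for FP), instead
   of a tree-shaped reduction of the lanes.  */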
6056 static bool
6057 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6058 stmt_vec_info stmt_info,
6059 gimple_stmt_iterator *gsi,
6060 gimple **vec_stmt, slp_tree slp_node,
6061 gimple *reduc_def_stmt,
6062 tree_code code, internal_fn reduc_fn,
6063 tree ops[3], tree vectype_in,
6064 int reduc_index, vec_loop_masks *masks)
6066 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6067 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6068 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6070 int ncopies;
6071 if (slp_node)
6072 ncopies = 1;
6073 else
6074 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6076 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6077 gcc_assert (ncopies == 1);
6078 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6080 if (slp_node)
6081 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6082 TYPE_VECTOR_SUBPARTS (vectype_in)));
6084 tree op0 = ops[1 - reduc_index];
6086 int group_size = 1;
6087 stmt_vec_info scalar_dest_def_info;
6088 auto_vec<tree> vec_oprnds0;
6089 if (slp_node)
6091 auto_vec<vec<tree> > vec_defs (2);
6092 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6093 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6094 vec_defs[0].release ();
6095 vec_defs[1].release ();
6096 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6097 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6099 else
6101 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6102 op0, &vec_oprnds0);
6103 scalar_dest_def_info = stmt_info;
6106 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6107 tree scalar_type = TREE_TYPE (scalar_dest);
6108 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6110 int vec_num = vec_oprnds0.length ();
6111 gcc_assert (vec_num == 1 || slp_node);
6112 tree vec_elem_type = TREE_TYPE (vectype_out);
6113 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6115 tree vector_identity = NULL_TREE;
6116 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6117 vector_identity = build_zero_cst (vectype_out);
6119 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6120 int i;
6121 tree def0;
6122 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6124 gimple *new_stmt;
6125 tree mask = NULL_TREE;
6126 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6127 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6129 /* Handle MINUS by adding the negative. */
6130 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6132 tree negated = make_ssa_name (vectype_out);
6133 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6134 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6135 def0 = negated;
6138 if (mask && mask_reduc_fn == IFN_LAST)
6139 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6140 vector_identity);
6142 /* On the first iteration the input is simply the scalar phi
6143 result, and for subsequent iterations it is the output of
6144 the preceding operation. */
6145 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6147 if (mask && mask_reduc_fn != IFN_LAST)
6148 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6149 def0, mask);
6150 else
6151 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6152 def0);
6153 /* For chained SLP reductions the output of the previous reduction
6154 operation serves as the input of the next. For the final statement
6155 the output cannot be a temporary - we reuse the original
6156 scalar destination of the last statement. */
6157 if (i != vec_num - 1)
6159 gimple_set_lhs (new_stmt, scalar_dest_var);
6160 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6161 gimple_set_lhs (new_stmt, reduc_var);
6164 else
6166 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6167 reduc_var, def0);
6168 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6169 /* Remove the statement, so that we can use the same code paths
6170 as for statements that we've just created. */
6171 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6172 gsi_remove (&tmp_gsi, true);
6175 if (i == vec_num - 1)
6177 gimple_set_lhs (new_stmt, scalar_dest);
6178 vect_finish_replace_stmt (loop_vinfo,
6179 scalar_dest_def_info,
6180 new_stmt);
6182 else
6183 vect_finish_stmt_generation (loop_vinfo,
6184 scalar_dest_def_info,
6185 new_stmt, gsi);
6187 if (slp_node)
6188 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6189 else
6191 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6192 *vec_stmt = new_stmt;
6196 return true;
6199 /* Function is_nonwrapping_integer_induction.
6201 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6202 does not cause overflow. */
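/* For example, with base B, step S and at most NI iterations, the
   check below verifies (unless signed overflow is already undefined
   for the type) that B + S * NI still fits in the precision of the
   induction's type.  */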
6204 static bool
6205 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6207 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6208 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6209 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6210 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6211 widest_int ni, max_loop_value, lhs_max;
6212 wi::overflow_type overflow = wi::OVF_NONE;
6214 /* Make sure the loop is integer based. */
6215 if (TREE_CODE (base) != INTEGER_CST
6216 || TREE_CODE (step) != INTEGER_CST)
6217 return false;
6219 /* Check that the max size of the loop will not wrap. */
6221 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6222 return true;
6224 if (! max_stmt_executions (loop, &ni))
6225 return false;
6227 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6228 &overflow);
6229 if (overflow)
6230 return false;
6232 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6233 TYPE_SIGN (lhs_type), &overflow);
6234 if (overflow)
6235 return false;
6237 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6238 <= TYPE_PRECISION (lhs_type));
6241 /* Check if masking can be supported by inserting a conditional expression.
6242 CODE is the code for the operation. COND_FN is the conditional internal
6243 function, if it exists. VECTYPE_IN is the type of the vector input. */
6244 static bool
6245 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6246 tree vectype_in)
6248 if (cond_fn != IFN_LAST
6249 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6250 OPTIMIZE_FOR_SPEED))
6251 return false;
6253 switch (code)
6255 case DOT_PROD_EXPR:
6256 case SAD_EXPR:
6257 return true;
6259 default:
6260 return false;
6264 /* Insert a conditional expression to enable masked vectorization. CODE is the
6265 code for the operation. VOP is the array of operands. MASK is the loop
6266 mask. GSI is a statement iterator used to place the new conditional
6267 expression. */
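/* For example, for DOT_PROD_EXPR the masked-out lanes of the
   multiplied operand are replaced with zero so they contribute
   nothing to the accumulated sum, while for SAD_EXPR one operand is
   replaced by the other so that the absolute difference in an
   inactive lane is zero.  */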
6268 static void
6269 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6270 gimple_stmt_iterator *gsi)
6272 switch (code)
6274 case DOT_PROD_EXPR:
6276 tree vectype = TREE_TYPE (vop[1]);
6277 tree zero = build_zero_cst (vectype);
6278 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6279 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6280 mask, vop[1], zero);
6281 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6282 vop[1] = masked_op1;
6283 break;
6286 case SAD_EXPR:
6288 tree vectype = TREE_TYPE (vop[1]);
6289 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6290 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6291 mask, vop[1], vop[0]);
6292 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6293 vop[1] = masked_op1;
6294 break;
6297 default:
6298 gcc_unreachable ();
6302 /* Function vectorizable_reduction.
6304 Check if STMT_INFO performs a reduction operation that can be vectorized.
6305 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6306 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6307 Return true if STMT_INFO is vectorizable in this way.
6309 This function also handles reduction idioms (patterns) that have been
6310 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6311 may be of this form:
6312 X = pattern_expr (arg0, arg1, ..., X)
6313 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6314 sequence that had been detected and replaced by the pattern-stmt
6315 (STMT_INFO).
6317 This function also handles reduction of condition expressions, for example:
6318 for (int i = 0; i < N; i++)
6319 if (a[i] < value)
6320 last = a[i];
6321 This is handled by vectorising the loop and creating an additional vector
6322 containing the loop indexes for which "a[i] < value" was true. In the
6323 function epilogue this is reduced to a single max value and then used to
6324 index into the vector of results.
6326 In some cases of reduction patterns, the type of the reduction variable X is
6327 different than the type of the other arguments of STMT_INFO.
6328 In such cases, the vectype that is used when transforming STMT_INFO into
6329 a vector stmt is different than the vectype that is used to determine the
6330 vectorization factor, because it consists of a different number of elements
6331 than the actual number of elements that are being operated upon in parallel.
6333 For example, consider an accumulation of shorts into an int accumulator.
6334 On some targets it's possible to vectorize this pattern operating on 8
6335 shorts at a time (hence, the vectype for purposes of determining the
6336 vectorization factor should be V8HI); on the other hand, the vectype that
6337 is used to create the vector form is actually V4SI (the type of the result).
6339 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6340 indicates what is the actual level of parallelism (V8HI in the example), so
6341 that the right vectorization factor would be derived. This vectype
6342 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6343 be used to create the vectorized stmt. The right vectype for the vectorized
6344 stmt is obtained from the type of the result X:
6345 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6347 This means that, contrary to "regular" reductions (or "regular" stmts in
6348 general), the following equation:
6349 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6350 does *NOT* necessarily hold for reduction patterns. */
6352 bool
6353 vectorizable_reduction (loop_vec_info loop_vinfo,
6354 stmt_vec_info stmt_info, slp_tree slp_node,
6355 slp_instance slp_node_instance,
6356 stmt_vector_for_cost *cost_vec)
6358 tree scalar_dest;
6359 tree vectype_in = NULL_TREE;
6360 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6361 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6362 stmt_vec_info cond_stmt_vinfo = NULL;
6363 tree scalar_type;
6364 int i;
6365 int ncopies;
6366 bool single_defuse_cycle = false;
6367 bool nested_cycle = false;
6368 bool double_reduc = false;
6369 int vec_num;
6370 tree tem;
6371 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6372 tree cond_reduc_val = NULL_TREE;
6374 /* Make sure it was already recognized as a reduction computation. */
6375 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6376 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6377 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6378 return false;
6380 /* The stmt we store reduction analysis meta on. */
6381 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6382 reduc_info->is_reduc_info = true;
6384 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6386 if (is_a <gphi *> (stmt_info->stmt))
6388 if (slp_node)
6390 /* We eventually need to set a vector type on invariant
6391 arguments. */
6392 unsigned j;
6393 slp_tree child;
6394 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6395 if (!vect_maybe_update_slp_op_vectype
6396 (child, SLP_TREE_VECTYPE (slp_node)))
6398 if (dump_enabled_p ())
6399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6400 "incompatible vector types for "
6401 "invariants\n");
6402 return false;
6405 /* Analysis for double-reduction is done on the outer
6406 loop PHI, nested cycles have no further restrictions. */
6407 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6409 else
6410 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6411 return true;
6414 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6415 stmt_vec_info phi_info = stmt_info;
6416 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6417 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6419 if (!is_a <gphi *> (stmt_info->stmt))
6421 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6422 return true;
6424 if (slp_node)
6426 slp_node_instance->reduc_phis = slp_node;
6427 /* ??? We're leaving slp_node to point to the PHIs, we only
6428 need it to get at the number of vector stmts which wasn't
6429 yet initialized for the instance root. */
6431 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6432 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6433 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6435 use_operand_p use_p;
6436 gimple *use_stmt;
6437 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6438 &use_p, &use_stmt);
6439 gcc_assert (res);
6440 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6441 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6445 /* PHIs should not participate in patterns. */
6446 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6447 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6449 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6450 and compute the reduction chain length. Discover the real
6451 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6452 tree reduc_def
6453 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6454 loop_latch_edge
6455 (gimple_bb (reduc_def_phi)->loop_father));
6456 unsigned reduc_chain_length = 0;
6457 bool only_slp_reduc_chain = true;
6458 stmt_info = NULL;
6459 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6460 while (reduc_def != PHI_RESULT (reduc_def_phi))
6462 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6463 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6464 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6466 if (dump_enabled_p ())
6467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6468 "reduction chain broken by patterns.\n");
6469 return false;
6471 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6472 only_slp_reduc_chain = false;
6473 /* ??? For epilogue generation live members of the chain need
6474 to point back to the PHI via their original stmt for
6475 info_for_reduction to work. */
6476 if (STMT_VINFO_LIVE_P (vdef))
6477 STMT_VINFO_REDUC_DEF (def) = phi_info;
6478 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6479 if (!assign)
6481 if (dump_enabled_p ())
6482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6483 "reduction chain includes calls.\n");
6484 return false;
6486 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6488 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6489 TREE_TYPE (gimple_assign_rhs1 (assign))))
6491 if (dump_enabled_p ())
6492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6493 "conversion in the reduction chain.\n");
6494 return false;
6497 else if (!stmt_info)
6498 /* First non-conversion stmt. */
6499 stmt_info = vdef;
6500 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6501 reduc_chain_length++;
6502 if (!stmt_info && slp_node)
6503 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6505 /* PHIs should not participate in patterns. */
6506 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6508 if (nested_in_vect_loop_p (loop, stmt_info))
6510 loop = loop->inner;
6511 nested_cycle = true;
6514 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6515 element. */
6516 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6518 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6519 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6521 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6522 gcc_assert (slp_node
6523 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6525 /* 1. Is vectorizable reduction? */
6526 /* Not supportable if the reduction variable is used in the loop, unless
6527 it's a reduction chain. */
6528 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6529 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6530 return false;
6532 /* Reductions that are not used even in an enclosing outer-loop,
6533 are expected to be "live" (used out of the loop). */
6534 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6535 && !STMT_VINFO_LIVE_P (stmt_info))
6536 return false;
6538 /* 2. Has this been recognized as a reduction pattern?
6540 Check if STMT represents a pattern that has been recognized
6541 in earlier analysis stages. For stmts that represent a pattern,
6542 the STMT_VINFO_RELATED_STMT field records the last stmt in
6543 the original sequence that constitutes the pattern. */
6545 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6546 if (orig_stmt_info)
6548 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6549 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6552 /* 3. Check the operands of the operation. The first operands are defined
6553 inside the loop body. The last operand is the reduction variable,
6554 which is defined by the loop-header-phi. */
6556 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6557 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6558 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6559 enum tree_code code = gimple_assign_rhs_code (stmt);
6560 bool lane_reduc_code_p
6561 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6562 int op_type = TREE_CODE_LENGTH (code);
6564 scalar_dest = gimple_assign_lhs (stmt);
6565 scalar_type = TREE_TYPE (scalar_dest);
6566 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6567 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6568 return false;
6570 /* Do not try to vectorize bit-precision reductions. */
6571 if (!type_has_mode_precision_p (scalar_type))
6572 return false;
6574 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6575 which means the only use of that PHI may be in the lane-reducing operation. */
6576 if (lane_reduc_code_p
6577 && reduc_chain_length != 1
6578 && !only_slp_reduc_chain)
6580 if (dump_enabled_p ())
6581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6582 "lane-reducing reduction with extra stmts.\n");
6583 return false;
6586 /* All uses but the last are expected to be defined in the loop.
6587 The last use is the reduction variable. In case of nested cycle this
6588 assumption is not true: we use reduc_index to record the index of the
6589 reduction variable. */
6590 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6591 /* We need to skip an extra operand for COND_EXPRs with embedded
6592 comparison. */
6593 unsigned opno_adjust = 0;
6594 if (code == COND_EXPR
6595 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6596 opno_adjust = 1;
6597 for (i = 0; i < op_type; i++)
6599 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6600 if (i == 0 && code == COND_EXPR)
6601 continue;
6603 stmt_vec_info def_stmt_info;
6604 enum vect_def_type dt;
6605 tree op;
6606 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6607 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6608 &def_stmt_info))
6610 if (dump_enabled_p ())
6611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6612 "use not simple.\n");
6613 return false;
6615 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6616 continue;
6618 /* There should be only one cycle def in the stmt, the one
6619 leading to reduc_def. */
6620 if (VECTORIZABLE_CYCLE_DEF (dt))
6621 return false;
6623 /* To properly compute ncopies we are interested in the widest
6624 non-reduction input type in case we're looking at a widening
6625 accumulation that we later handle in vect_transform_reduction. */
6626 if (lane_reduc_code_p
6627 && tem
6628 && (!vectype_in
6629 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6630 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6631 vectype_in = tem;
6633 if (code == COND_EXPR)
6635 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6636 if (dt == vect_constant_def)
6638 cond_reduc_dt = dt;
6639 cond_reduc_val = op;
6641 if (dt == vect_induction_def
6642 && def_stmt_info
6643 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6645 cond_reduc_dt = dt;
6646 cond_stmt_vinfo = def_stmt_info;
6650 if (!vectype_in)
6651 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6652 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6654 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6655 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6656 /* If we have a condition reduction, see if we can simplify it further. */
6657 if (v_reduc_type == COND_REDUCTION)
6659 if (slp_node)
6660 return false;
6662 /* When the condition uses the reduction value in the condition, fail. */
6663 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6665 if (dump_enabled_p ())
6666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6667 "condition depends on previous iteration\n");
6668 return false;
6671 if (reduc_chain_length == 1
6672 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6673 vectype_in, OPTIMIZE_FOR_SPEED))
6675 if (dump_enabled_p ())
6676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6677 "optimizing condition reduction with"
6678 " FOLD_EXTRACT_LAST.\n");
6679 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6681 else if (cond_reduc_dt == vect_induction_def)
6683 tree base
6684 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6685 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6687 gcc_assert (TREE_CODE (base) == INTEGER_CST
6688 && TREE_CODE (step) == INTEGER_CST);
6689 cond_reduc_val = NULL_TREE;
6690 enum tree_code cond_reduc_op_code = ERROR_MARK;
6691 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6692 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6694 /* Find a suitable "no match" value: below the base for MAX_EXPR,
6695 above the base for MIN_EXPR; punt for now if the base is the type's
6696 minimum value for MAX_EXPR or its maximum value for MIN_EXPR. */
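/* For instance (illustrative): with a decreasing induction starting at
   100 every recorded value is <= 100, so 101 (base + 1) can act as the
   "no match" value and the epilogue reduces with MIN_EXPR; symmetrically,
   an increasing induction uses a value below the base (or simply 0)
   together with MAX_EXPR.  */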
6697 else if (tree_int_cst_sgn (step) == -1)
6699 cond_reduc_op_code = MIN_EXPR;
6700 if (tree_int_cst_sgn (base) == -1)
6701 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6702 else if (tree_int_cst_lt (base,
6703 TYPE_MAX_VALUE (TREE_TYPE (base))))
6704 cond_reduc_val
6705 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6707 else
6709 cond_reduc_op_code = MAX_EXPR;
6710 if (tree_int_cst_sgn (base) == 1)
6711 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6712 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6713 base))
6714 cond_reduc_val
6715 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6717 if (cond_reduc_val)
6719 if (dump_enabled_p ())
6720 dump_printf_loc (MSG_NOTE, vect_location,
6721 "condition expression based on "
6722 "integer induction.\n");
6723 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6724 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6725 = cond_reduc_val;
6726 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6729 else if (cond_reduc_dt == vect_constant_def)
6731 enum vect_def_type cond_initial_dt;
6732 tree cond_initial_val
6733 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6735 gcc_assert (cond_reduc_val != NULL_TREE);
6736 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6737 if (cond_initial_dt == vect_constant_def
6738 && types_compatible_p (TREE_TYPE (cond_initial_val),
6739 TREE_TYPE (cond_reduc_val)))
6741 tree e = fold_binary (LE_EXPR, boolean_type_node,
6742 cond_initial_val, cond_reduc_val);
6743 if (e && (integer_onep (e) || integer_zerop (e)))
6745 if (dump_enabled_p ())
6746 dump_printf_loc (MSG_NOTE, vect_location,
6747 "condition expression based on "
6748 "compile time constant.\n");
6749 /* Record reduction code at analysis stage. */
6750 STMT_VINFO_REDUC_CODE (reduc_info)
6751 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6752 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6758 if (STMT_VINFO_LIVE_P (phi_info))
6759 return false;
6761 if (slp_node)
6762 ncopies = 1;
6763 else
6764 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6766 gcc_assert (ncopies >= 1);
6768 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6770 if (nested_cycle)
6772 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6773 == vect_double_reduction_def);
6774 double_reduc = true;
6777 /* 4.2. Check support for the epilog operation.
6779 If STMT represents a reduction pattern, then the type of the
6780 reduction variable may be different than the type of the rest
6781 of the arguments. For example, consider the case of accumulation
6782 of shorts into an int accumulator; the original code:
6783 S1: int_a = (int) short_a;
6784 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6786 was replaced with:
6787 STMT: int_acc = widen_sum <short_a, int_acc>
6789 This means that:
6790 1. The tree-code that is used to create the vector operation in the
6791 epilog code (that reduces the partial results) is not the
6792 tree-code of STMT, but is rather the tree-code of the original
6793 stmt from the pattern that STMT is replacing. I.e., in the example
6794 above we want to use 'widen_sum' in the loop, but 'plus' in the
6795 epilog.
6796 2. The type (mode) we use to check available target support
6797 for the vector operation to be created in the *epilog*, is
6798 determined by the type of the reduction variable (in the example
6799 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6800 However the type (mode) we use to check available target support
6801 for the vector operation to be created *inside the loop*, is
6802 determined by the type of the other arguments to STMT (in the
6803 example we'd check this: optab_handler (widen_sum_optab,
6804 vect_short_mode)).
6806 This is contrary to "regular" reductions, in which the types of all
6807 the arguments are the same as the type of the reduction variable.
6808 For "regular" reductions we can therefore use the same vector type
6809 (and also the same tree-code) when generating the epilog code and
6810 when generating the code inside the loop. */
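/* As a sketch of the epilog step itself (illustrative, target dependent):
   if the loop ends with a partial-result vector {s0, s1, s2, s3}, the
   epilog combines the lanes into one scalar, either via a direct reduction
   internal function such as .REDUC_PLUS or via a log2(nunits) sequence of
   vector shifts/permutes and adds.  */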
6812 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6813 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6815 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6816 if (reduction_type == TREE_CODE_REDUCTION)
6818 /* Check whether it's ok to change the order of the computation.
6819 Generally, when vectorizing a reduction we change the order of the
6820 computation. This may change the behavior of the program in some
6821 cases, so we need to check that this is ok. One exception is when
6822 vectorizing an outer-loop: the inner-loop is executed sequentially,
6823 and therefore vectorizing reductions in the inner-loop during
6824 outer-loop vectorization is safe. */
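/* For example (illustrative): a float accumulation

     for (i = 0; i < n; i++)
       sum += a[i];

   compiled without -ffast-math must keep the original association, so
   instead of accumulating independent partial sums per lane we use
   FOLD_LEFT_REDUCTION, which folds each vector of elements into the
   scalar accumulator in order inside the loop.  */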
6825 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6827 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6828 is not directly used in stmt. */
6829 if (!only_slp_reduc_chain
6830 && reduc_chain_length != 1)
6832 if (dump_enabled_p ())
6833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6834 "in-order reduction chain without SLP.\n");
6835 return false;
6837 STMT_VINFO_REDUC_TYPE (reduc_info)
6838 = reduction_type = FOLD_LEFT_REDUCTION;
6840 else if (!commutative_tree_code (orig_code)
6841 || !associative_tree_code (orig_code))
6843 if (dump_enabled_p ())
6844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6845 "reduction: not commutative/associative");
6846 return false;
6850 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6851 && ncopies > 1)
6853 if (dump_enabled_p ())
6854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6855 "multiple types in double reduction or condition "
6856 "reduction or fold-left reduction.\n");
6857 return false;
6860 internal_fn reduc_fn = IFN_LAST;
6861 if (reduction_type == TREE_CODE_REDUCTION
6862 || reduction_type == FOLD_LEFT_REDUCTION
6863 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6864 || reduction_type == CONST_COND_REDUCTION)
6866 if (reduction_type == FOLD_LEFT_REDUCTION
6867 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6868 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6870 if (reduc_fn != IFN_LAST
6871 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6872 OPTIMIZE_FOR_SPEED))
6874 if (dump_enabled_p ())
6875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6876 "reduc op not supported by target.\n");
6878 reduc_fn = IFN_LAST;
6881 else
6883 if (!nested_cycle || double_reduc)
6885 if (dump_enabled_p ())
6886 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6887 "no reduc code for scalar code.\n");
6889 return false;
6893 else if (reduction_type == COND_REDUCTION)
6895 int scalar_precision
6896 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6897 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6898 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6899 nunits_out);
6901 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6902 OPTIMIZE_FOR_SPEED))
6903 reduc_fn = IFN_REDUC_MAX;
6905 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6907 if (reduction_type != EXTRACT_LAST_REDUCTION
6908 && (!nested_cycle || double_reduc)
6909 && reduc_fn == IFN_LAST
6910 && !nunits_out.is_constant ())
6912 if (dump_enabled_p ())
6913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6914 "missing target support for reduction on"
6915 " variable-length vectors.\n");
6916 return false;
6919 /* For SLP reductions, see if there is a neutral value we can use. */
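/* Roughly (illustrative, see neutral_op_for_slp_reduction): the neutral
   value is 0 for additive codes and BIT_IOR_EXPR/BIT_XOR_EXPR, 1 for
   MULT_EXPR and all-ones for BIT_AND_EXPR, while for MIN_EXPR/MAX_EXPR a
   value is only available when the group forms a reduction chain.  */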
6920 tree neutral_op = NULL_TREE;
6921 if (slp_node)
6922 neutral_op = neutral_op_for_slp_reduction
6923 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6924 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6926 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6928 /* We can't support in-order reductions of code such as this:
6930 for (int i = 0; i < n1; ++i)
6931 for (int j = 0; j < n2; ++j)
6932 l += a[j];
6934 since GCC effectively transforms the loop when vectorizing:
6936 for (int i = 0; i < n1 / VF; ++i)
6937 for (int j = 0; j < n2; ++j)
6938 for (int k = 0; k < VF; ++k)
6939 l += a[j];
6941 which is a reassociation of the original operation. */
6942 if (dump_enabled_p ())
6943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6944 "in-order double reduction not supported.\n");
6946 return false;
6949 if (reduction_type == FOLD_LEFT_REDUCTION
6950 && slp_node
6951 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6953 /* We cannot use in-order reductions in this case because there is
6954 an implicit reassociation of the operations involved. */
6955 if (dump_enabled_p ())
6956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6957 "in-order unchained SLP reductions not supported.\n");
6958 return false;
6961 /* For double reductions, and for SLP reductions with a neutral value,
6962 we construct a variable-length initial vector by loading a vector
6963 full of the neutral value and then shift-and-inserting the start
6964 values into the low-numbered elements. */
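/* For instance (illustrative): for a sum reduction with start value S on a
   length-agnostic vector the initial vector becomes {S, 0, 0, ...}: a
   vector of the neutral value 0 is created and S is shifted into element 0
   via IFN_VEC_SHL_INSERT, hence the support check below.  */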
6965 if ((double_reduc || neutral_op)
6966 && !nunits_out.is_constant ()
6967 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6968 vectype_out, OPTIMIZE_FOR_SPEED))
6970 if (dump_enabled_p ())
6971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6972 "reduction on variable-length vectors requires"
6973 " target support for a vector-shift-and-insert"
6974 " operation.\n");
6975 return false;
6978 /* Check extra constraints for variable-length unchained SLP reductions. */
6979 if (STMT_SLP_TYPE (stmt_info)
6980 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6981 && !nunits_out.is_constant ())
6983 /* We checked above that we could build the initial vector when
6984 there's a neutral element value. Check here for the case in
6985 which each SLP statement has its own initial value and in which
6986 that value needs to be repeated for every instance of the
6987 statement within the initial vector. */
6988 unsigned int group_size = SLP_TREE_LANES (slp_node);
6989 if (!neutral_op
6990 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6991 TREE_TYPE (vectype_out)))
6993 if (dump_enabled_p ())
6994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6995 "unsupported form of SLP reduction for"
6996 " variable-length vectors: cannot build"
6997 " initial vector.\n");
6998 return false;
7000 /* The epilogue code relies on the number of elements being a multiple
7001 of the group size. The duplicate-and-interleave approach to setting
7002 up the initial vector does too. */
7003 if (!multiple_p (nunits_out, group_size))
7005 if (dump_enabled_p ())
7006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7007 "unsupported form of SLP reduction for"
7008 " variable-length vectors: the vector size"
7009 " is not a multiple of the number of results.\n");
7010 return false;
7014 if (reduction_type == COND_REDUCTION)
7016 widest_int ni;
7018 if (! max_loop_iterations (loop, &ni))
7020 if (dump_enabled_p ())
7021 dump_printf_loc (MSG_NOTE, vect_location,
7022 "loop count not known, cannot create cond "
7023 "reduction.\n");
7024 return false;
7026 /* Convert backedges to iterations. */
7027 ni += 1;
7029 /* The additional index will have the same type as the condition. Check
7030 that the iteration count fits into this type less one (the zero slot
7031 is reserved for the case in which there are no matches). */
7032 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7033 if (wi::geu_p (ni, wi::to_widest (max_index)))
7035 if (dump_enabled_p ())
7036 dump_printf_loc (MSG_NOTE, vect_location,
7037 "loop size is greater than data size.\n");
7038 return false;
7042 /* In case the vectorization factor (VF) is bigger than the number
7043 of elements that we can fit in a vectype (nunits), we have to generate
7044 more than one vector stmt, i.e. we need to "unroll" the
7045 vector stmt by a factor VF/nunits. For more details see documentation
7046 in vectorizable_operation. */
7048 /* If the reduction is used in an outer loop we need to generate
7049 VF intermediate results, like so (e.g. for ncopies=2):
7050 r0 = phi (init, r0)
7051 r1 = phi (init, r1)
7052 r0 = x0 + r0;
7053 r1 = x1 + r1;
7054 (i.e. we generate VF results in 2 registers).
7055 In this case we have a separate def-use cycle for each copy, and therefore
7056 for each copy we get the vector def for the reduction variable from the
7057 respective phi node created for this copy.
7059 Otherwise (the reduction is unused in the loop nest), we can combine
7060 together intermediate results, like so (e.g. for ncopies=2):
7061 r = phi (init, r)
7062 r = x0 + r;
7063 r = x1 + r;
7064 (i.e. we generate VF/2 results in a single register).
7065 In this case for each copy we get the vector def for the reduction variable
7066 from the vectorized reduction operation generated in the previous iteration.
7068 This only works when we see both the reduction PHI and its only consumer
7069 in vectorizable_reduction and there are no intermediate stmts
7070 participating. */
7071 if (ncopies > 1
7072 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7073 && reduc_chain_length == 1)
7074 single_defuse_cycle = true;
7076 if (single_defuse_cycle || lane_reduc_code_p)
7078 gcc_assert (code != COND_EXPR);
7080 /* 4. Supportable by target? */
7081 bool ok = true;
7083 /* 4.1. check support for the operation in the loop */
7084 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
7085 if (!optab)
7087 if (dump_enabled_p ())
7088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7089 "no optab.\n");
7090 ok = false;
7093 machine_mode vec_mode = TYPE_MODE (vectype_in);
7094 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7096 if (dump_enabled_p ())
7097 dump_printf (MSG_NOTE, "op not supported by target.\n");
7098 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7099 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7100 ok = false;
7101 else
7102 if (dump_enabled_p ())
7103 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7106 /* Worthwhile without SIMD support? */
7107 if (ok
7108 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7109 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7111 if (dump_enabled_p ())
7112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7113 "not worthwhile without SIMD support.\n");
7114 ok = false;
7117 /* lane-reducing operations have to go through vect_transform_reduction.
7118 For the other cases try without the single cycle optimization. */
7119 if (!ok)
7121 if (lane_reduc_code_p)
7122 return false;
7123 else
7124 single_defuse_cycle = false;
7127 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7129 /* If the reduction stmt is one of the patterns that have a lane-reducing
7130 operation embedded we cannot handle the !single_defuse_cycle case. */
7131 if ((ncopies > 1 && ! single_defuse_cycle)
7132 && lane_reduc_code_p)
7134 if (dump_enabled_p ())
7135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7136 "multi def-use cycle not possible for lane-reducing "
7137 "reduction operation\n");
7138 return false;
7141 if (slp_node
7142 && !(!single_defuse_cycle
7143 && code != DOT_PROD_EXPR
7144 && code != WIDEN_SUM_EXPR
7145 && code != SAD_EXPR
7146 && reduction_type != FOLD_LEFT_REDUCTION))
7147 for (i = 0; i < op_type; i++)
7148 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7150 if (dump_enabled_p ())
7151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7152 "incompatible vector types for invariants\n");
7153 return false;
7156 if (slp_node)
7157 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7158 else
7159 vec_num = 1;
7161 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7162 reduction_type, ncopies, cost_vec);
7163 if (dump_enabled_p ()
7164 && reduction_type == FOLD_LEFT_REDUCTION)
7165 dump_printf_loc (MSG_NOTE, vect_location,
7166 "using an in-order (fold-left) reduction.\n");
7167 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7168 /* All reductions but the single-defuse-cycle optimized, lane-reducing
7169 and fold-left ones go through their own vectorizable_* routines. */
7170 if (!single_defuse_cycle
7171 && code != DOT_PROD_EXPR
7172 && code != WIDEN_SUM_EXPR
7173 && code != SAD_EXPR
7174 && reduction_type != FOLD_LEFT_REDUCTION)
7176 stmt_vec_info tem
7177 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7178 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7180 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7181 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7183 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7184 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7186 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7188 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7189 internal_fn cond_fn = get_conditional_internal_fn (code);
7191 if (reduction_type != FOLD_LEFT_REDUCTION
7192 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7193 && (cond_fn == IFN_LAST
7194 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7195 OPTIMIZE_FOR_SPEED)))
7197 if (dump_enabled_p ())
7198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7199 "can't operate on partial vectors because"
7200 " no conditional operation is available.\n");
7201 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7203 else if (reduction_type == FOLD_LEFT_REDUCTION
7204 && reduc_fn == IFN_LAST
7205 && !expand_vec_cond_expr_p (vectype_in,
7206 truth_type_for (vectype_in),
7207 SSA_NAME))
7209 if (dump_enabled_p ())
7210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7211 "can't operate on partial vectors because"
7212 " no conditional operation is available.\n");
7213 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7215 else
7216 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7217 vectype_in, NULL);
7219 return true;
7222 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7223 value. */
7225 bool
7226 vect_transform_reduction (loop_vec_info loop_vinfo,
7227 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7228 gimple **vec_stmt, slp_tree slp_node)
7230 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7231 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7232 int i;
7233 int ncopies;
7234 int vec_num;
7236 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7237 gcc_assert (reduc_info->is_reduc_info);
7239 if (nested_in_vect_loop_p (loop, stmt_info))
7241 loop = loop->inner;
7242 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7245 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7246 enum tree_code code = gimple_assign_rhs_code (stmt);
7247 int op_type = TREE_CODE_LENGTH (code);
7249 /* Flatten RHS. */
7250 tree ops[3];
7251 switch (get_gimple_rhs_class (code))
7253 case GIMPLE_TERNARY_RHS:
7254 ops[2] = gimple_assign_rhs3 (stmt);
7255 /* Fall thru. */
7256 case GIMPLE_BINARY_RHS:
7257 ops[0] = gimple_assign_rhs1 (stmt);
7258 ops[1] = gimple_assign_rhs2 (stmt);
7259 break;
7260 default:
7261 gcc_unreachable ();
7264 /* All uses but the last are expected to be defined in the loop.
7265 The last use is the reduction variable. In case of nested cycle this
7266 assumption is not true: we use reduc_index to record the index of the
7267 reduction variable. */
7268 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7269 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7270 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7271 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7273 if (slp_node)
7275 ncopies = 1;
7276 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7278 else
7280 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7281 vec_num = 1;
7284 internal_fn cond_fn = get_conditional_internal_fn (code);
7285 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7286 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
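/* Two masking strategies may apply when the loop body is masked (a sketch,
   not specific to any target): either the statement is emitted as a
   conditional internal call, e.g.

     new_acc = .COND_ADD (loop_mask, acc, x, acc);

   so that inactive lanes keep the old accumulator value, or, for the codes
   accepted by use_mask_by_cond_expr_p, one input is first blended with a
   VEC_COND_EXPR on the mask and the unconditional operation is used.  */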
7288 /* Transform. */
7289 tree new_temp = NULL_TREE;
7290 auto_vec<tree> vec_oprnds0;
7291 auto_vec<tree> vec_oprnds1;
7292 auto_vec<tree> vec_oprnds2;
7293 tree def0;
7295 if (dump_enabled_p ())
7296 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7298 /* FORNOW: Multiple types are not supported for condition. */
7299 if (code == COND_EXPR)
7300 gcc_assert (ncopies == 1);
7302 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7304 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7305 if (reduction_type == FOLD_LEFT_REDUCTION)
7307 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7308 return vectorize_fold_left_reduction
7309 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7310 reduc_fn, ops, vectype_in, reduc_index, masks);
7313 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7314 gcc_assert (single_defuse_cycle
7315 || code == DOT_PROD_EXPR
7316 || code == WIDEN_SUM_EXPR
7317 || code == SAD_EXPR);
7319 /* Create the destination vector */
7320 tree scalar_dest = gimple_assign_lhs (stmt);
7321 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7323 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7324 single_defuse_cycle && reduc_index == 0
7325 ? NULL_TREE : ops[0], &vec_oprnds0,
7326 single_defuse_cycle && reduc_index == 1
7327 ? NULL_TREE : ops[1], &vec_oprnds1,
7328 op_type == ternary_op
7329 && !(single_defuse_cycle && reduc_index == 2)
7330 ? ops[2] : NULL_TREE, &vec_oprnds2);
7331 if (single_defuse_cycle)
7333 gcc_assert (!slp_node);
7334 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7335 ops[reduc_index],
7336 reduc_index == 0 ? &vec_oprnds0
7337 : (reduc_index == 1 ? &vec_oprnds1
7338 : &vec_oprnds2));
7341 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7343 gimple *new_stmt;
7344 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7345 if (masked_loop_p && !mask_by_cond_expr)
7347 /* Make sure that the reduction accumulator is vop[0]. */
7348 if (reduc_index == 1)
7350 gcc_assert (commutative_tree_code (code));
7351 std::swap (vop[0], vop[1]);
7353 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7354 vectype_in, i);
7355 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7356 vop[0], vop[1], vop[0]);
7357 new_temp = make_ssa_name (vec_dest, call);
7358 gimple_call_set_lhs (call, new_temp);
7359 gimple_call_set_nothrow (call, true);
7360 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7361 new_stmt = call;
7363 else
7365 if (op_type == ternary_op)
7366 vop[2] = vec_oprnds2[i];
7368 if (masked_loop_p && mask_by_cond_expr)
7370 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7371 vectype_in, i);
7372 build_vect_cond_expr (code, vop, mask, gsi);
7375 new_stmt = gimple_build_assign (vec_dest, code,
7376 vop[0], vop[1], vop[2]);
7377 new_temp = make_ssa_name (vec_dest, new_stmt);
7378 gimple_assign_set_lhs (new_stmt, new_temp);
7379 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7382 if (slp_node)
7383 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7384 else if (single_defuse_cycle
7385 && i < ncopies - 1)
7387 if (reduc_index == 0)
7388 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7389 else if (reduc_index == 1)
7390 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7391 else if (reduc_index == 2)
7392 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7394 else
7395 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7398 if (!slp_node)
7399 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7401 return true;
7404 /* Transform phase of a cycle PHI. */
7406 bool
7407 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7408 stmt_vec_info stmt_info, gimple **vec_stmt,
7409 slp_tree slp_node, slp_instance slp_node_instance)
7411 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7412 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7413 int i;
7414 int ncopies;
7415 int j;
7416 bool nested_cycle = false;
7417 int vec_num;
7419 if (nested_in_vect_loop_p (loop, stmt_info))
7421 loop = loop->inner;
7422 nested_cycle = true;
7425 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7426 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7427 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7428 gcc_assert (reduc_info->is_reduc_info);
7430 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7431 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7432 /* Leave the scalar phi in place. */
7433 return true;
7435 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7436 /* For a nested cycle we do not fill the above. */
7437 if (!vectype_in)
7438 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7439 gcc_assert (vectype_in);
7441 if (slp_node)
7443 /* The size vect_schedule_slp_instance computes is off for us. */
7444 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7445 * SLP_TREE_LANES (slp_node), vectype_in);
7446 ncopies = 1;
7448 else
7450 vec_num = 1;
7451 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7454 /* Check whether we should use a single PHI node and accumulate
7455 vectors to one before the backedge. */
7456 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7457 ncopies = 1;
7459 /* Create the destination vector */
7460 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7461 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7462 vectype_out);
7464 /* Get the loop-entry arguments. */
7465 tree vec_initial_def;
7466 auto_vec<tree> vec_initial_defs;
7467 if (slp_node)
7469 vec_initial_defs.reserve (vec_num);
7470 if (nested_cycle)
7472 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7473 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7474 &vec_initial_defs);
7476 else
7478 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7479 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7480 tree neutral_op
7481 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7482 STMT_VINFO_REDUC_CODE (reduc_info),
7483 first != NULL);
7484 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7485 &vec_initial_defs, vec_num,
7486 first != NULL, neutral_op);
7489 else
7491 /* Get at the scalar def before the loop that defines the initial
7492 value of the reduction variable. */
7493 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7494 loop_preheader_edge (loop));
7495 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7496 and we can't use zero for induc_val, use initial_def. Similarly
7497 for REDUC_MIN and initial_def larger than the base. */
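/* E.g. (illustrative): for a MIN_EXPR epilogue the analysis may have
   chosen base + 1, say 101, as the "no match" value; if the scalar initial
   value is an even larger constant such as 200 it is also above every real
   match, so it can serve as the fill value directly and the epilogue no
   longer has to substitute initial_def for the sentinel.  */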
7498 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7500 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7501 if (TREE_CODE (initial_def) == INTEGER_CST
7502 && !integer_zerop (induc_val)
7503 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7504 && tree_int_cst_lt (initial_def, induc_val))
7505 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7506 && tree_int_cst_lt (induc_val, initial_def))))
7508 induc_val = initial_def;
7509 /* Communicate to the epilogue generation that we used
7510 the initial_def. */
7511 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7513 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7514 vec_initial_defs.create (ncopies);
7515 for (i = 0; i < ncopies; ++i)
7516 vec_initial_defs.quick_push (vec_initial_def);
7518 else if (nested_cycle)
7520 /* Do not use an adjustment def as that case is not supported
7521 correctly if ncopies is not one. */
7522 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7523 ncopies, initial_def,
7524 &vec_initial_defs);
7526 else
7528 tree adjustment_def = NULL_TREE;
7529 tree *adjustment_defp = &adjustment_def;
7530 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7531 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7532 adjustment_defp = NULL;
7533 vec_initial_def
7534 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7535 initial_def, adjustment_defp);
7536 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7537 vec_initial_defs.create (ncopies);
7538 for (i = 0; i < ncopies; ++i)
7539 vec_initial_defs.quick_push (vec_initial_def);
7543 /* Generate the reduction PHIs upfront. */
7544 for (i = 0; i < vec_num; i++)
7546 tree vec_init_def = vec_initial_defs[i];
7547 for (j = 0; j < ncopies; j++)
7549 /* Create the reduction-phi that defines the reduction
7550 operand. */
7551 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7553 /* Set the loop-entry arg of the reduction-phi. */
7554 if (j != 0 && nested_cycle)
7555 vec_init_def = vec_initial_defs[j];
7556 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7557 UNKNOWN_LOCATION);
7559 /* The loop-latch arg is set in epilogue processing. */
7561 if (slp_node)
7562 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7563 else
7565 if (j == 0)
7566 *vec_stmt = new_phi;
7567 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7572 return true;
7575 /* Vectorizes LC PHIs. */
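/* (Illustrative) such a PHI has exactly one argument and is what
   loop-closed SSA form places on a loop exit to carry a value defined in
   the loop to its outside uses, e.g.

     x_lcssa = PHI <x_3(exit_edge)>;

   It is vectorized by building the corresponding single-argument vector
   PHI.  */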
7577 bool
7578 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7579 stmt_vec_info stmt_info, gimple **vec_stmt,
7580 slp_tree slp_node)
7582 if (!loop_vinfo
7583 || !is_a <gphi *> (stmt_info->stmt)
7584 || gimple_phi_num_args (stmt_info->stmt) != 1)
7585 return false;
7587 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7588 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7589 return false;
7591 if (!vec_stmt) /* transformation not required. */
7593 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7594 return true;
7597 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7598 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7599 basic_block bb = gimple_bb (stmt_info->stmt);
7600 edge e = single_pred_edge (bb);
7601 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7602 auto_vec<tree> vec_oprnds;
7603 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7604 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7605 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7606 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7608 /* Create the vectorized LC PHI node. */
7609 gphi *new_phi = create_phi_node (vec_dest, bb);
7610 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7611 if (slp_node)
7612 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7613 else
7614 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7616 if (!slp_node)
7617 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7619 return true;
7622 /* Vectorizes PHIs. */
7624 bool
7625 vectorizable_phi (vec_info *,
7626 stmt_vec_info stmt_info, gimple **vec_stmt,
7627 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7629 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7630 return false;
7632 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7633 return false;
7635 tree vectype = SLP_TREE_VECTYPE (slp_node);
7637 if (!vec_stmt) /* transformation not required. */
7639 slp_tree child;
7640 unsigned i;
7641 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7642 if (!child)
7644 if (dump_enabled_p ())
7645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7646 "PHI node with unvectorized backedge def\n");
7647 return false;
7649 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7651 if (dump_enabled_p ())
7652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7653 "incompatible vector types for invariants\n");
7654 return false;
7656 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7657 vector_stmt, stmt_info, vectype, 0, vect_body);
7658 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7659 return true;
7662 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7663 basic_block bb = gimple_bb (stmt_info->stmt);
7664 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7665 auto_vec<gphi *> new_phis;
7666 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7668 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7670 /* Skip not yet vectorized defs. */
7671 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7672 && SLP_TREE_VEC_STMTS (child).is_empty ())
7673 continue;
7675 auto_vec<tree> vec_oprnds;
7676 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7677 if (!new_phis.exists ())
7679 new_phis.create (vec_oprnds.length ());
7680 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7682 /* Create the vectorized PHI node. */
7683 new_phis.quick_push (create_phi_node (vec_dest, bb));
7684 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7687 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7688 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7689 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7691 /* We should have at least one already vectorized child. */
7692 gcc_assert (new_phis.exists ());
7694 return true;
7698 /* Function vect_min_worthwhile_factor.
7700 For a loop where we could vectorize the operation indicated by CODE,
7701 return the minimum vectorization factor that makes it worthwhile
7702 to use generic vectors. */
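/* (Illustrative) "generic vectors" here means open-coding the vector
   operation on word-size integers, e.g. adding four packed shorts with
   plain integer arithmetic; that emulation is only assumed profitable when
   at least the returned number of elements is handled per operation.  */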
7703 static unsigned int
7704 vect_min_worthwhile_factor (enum tree_code code)
7706 switch (code)
7708 case PLUS_EXPR:
7709 case MINUS_EXPR:
7710 case NEGATE_EXPR:
7711 return 4;
7713 case BIT_AND_EXPR:
7714 case BIT_IOR_EXPR:
7715 case BIT_XOR_EXPR:
7716 case BIT_NOT_EXPR:
7717 return 2;
7719 default:
7720 return INT_MAX;
7724 /* Return true if VINFO indicates we are doing loop vectorization and if
7725 it is worth decomposing CODE operations into scalar operations for
7726 that loop's vectorization factor. */
7728 bool
7729 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7731 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7732 unsigned HOST_WIDE_INT value;
7733 return (loop_vinfo
7734 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7735 && value >= vect_min_worthwhile_factor (code));
7738 /* Function vectorizable_induction
7740 Check if STMT_INFO performs an induction computation that can be vectorized.
7741 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7742 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7743 Return true if STMT_INFO is vectorizable in this way. */
7745 bool
7746 vectorizable_induction (loop_vec_info loop_vinfo,
7747 stmt_vec_info stmt_info,
7748 gimple **vec_stmt, slp_tree slp_node,
7749 stmt_vector_for_cost *cost_vec)
7751 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7752 unsigned ncopies;
7753 bool nested_in_vect_loop = false;
7754 class loop *iv_loop;
7755 tree vec_def;
7756 edge pe = loop_preheader_edge (loop);
7757 basic_block new_bb;
7758 tree new_vec, vec_init, vec_step, t;
7759 tree new_name;
7760 gimple *new_stmt;
7761 gphi *induction_phi;
7762 tree induc_def, vec_dest;
7763 tree init_expr, step_expr;
7764 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7765 unsigned i;
7766 tree expr;
7767 gimple_stmt_iterator si;
7769 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7770 if (!phi)
7771 return false;
7773 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7774 return false;
7776 /* Make sure it was recognized as induction computation. */
7777 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7778 return false;
7780 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7781 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7783 if (slp_node)
7784 ncopies = 1;
7785 else
7786 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7787 gcc_assert (ncopies >= 1);
7789 /* FORNOW. These restrictions should be relaxed. */
7790 if (nested_in_vect_loop_p (loop, stmt_info))
7792 imm_use_iterator imm_iter;
7793 use_operand_p use_p;
7794 gimple *exit_phi;
7795 edge latch_e;
7796 tree loop_arg;
7798 if (ncopies > 1)
7800 if (dump_enabled_p ())
7801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7802 "multiple types in nested loop.\n");
7803 return false;
7806 exit_phi = NULL;
7807 latch_e = loop_latch_edge (loop->inner);
7808 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7809 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7811 gimple *use_stmt = USE_STMT (use_p);
7812 if (is_gimple_debug (use_stmt))
7813 continue;
7815 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7817 exit_phi = use_stmt;
7818 break;
7821 if (exit_phi)
7823 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7824 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7825 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7827 if (dump_enabled_p ())
7828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7829 "inner-loop induction only used outside "
7830 "of the outer vectorized loop.\n");
7831 return false;
7835 nested_in_vect_loop = true;
7836 iv_loop = loop->inner;
7838 else
7839 iv_loop = loop;
7840 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7842 if (slp_node && !nunits.is_constant ())
7844 /* The current SLP code creates the step value element-by-element. */
7845 if (dump_enabled_p ())
7846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7847 "SLP induction not supported for variable-length"
7848 " vectors.\n");
7849 return false;
7852 if (!vec_stmt) /* transformation not required. */
7854 unsigned inside_cost = 0, prologue_cost = 0;
7855 if (slp_node)
7857 /* We eventually need to set a vector type on invariant
7858 arguments. */
7859 unsigned j;
7860 slp_tree child;
7861 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7862 if (!vect_maybe_update_slp_op_vectype
7863 (child, SLP_TREE_VECTYPE (slp_node)))
7865 if (dump_enabled_p ())
7866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7867 "incompatible vector types for "
7868 "invariants\n");
7869 return false;
7871 /* loop cost for vec_loop. */
7872 inside_cost
7873 = record_stmt_cost (cost_vec,
7874 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7875 vector_stmt, stmt_info, 0, vect_body);
7876 /* prologue cost for vec_init (if not nested) and step. */
7877 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
7878 scalar_to_vec,
7879 stmt_info, 0, vect_prologue);
7881 else /* if (!slp_node) */
7883 /* loop cost for vec_loop. */
7884 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
7885 stmt_info, 0, vect_body);
7886 /* prologue cost for vec_init and vec_step. */
7887 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
7888 stmt_info, 0, vect_prologue);
7890 if (dump_enabled_p ())
7891 dump_printf_loc (MSG_NOTE, vect_location,
7892 "vect_model_induction_cost: inside_cost = %d, "
7893 "prologue_cost = %d .\n", inside_cost,
7894 prologue_cost);
7896 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7897 DUMP_VECT_SCOPE ("vectorizable_induction");
7898 return true;
7901 /* Transform. */
7903 /* Compute a vector variable, initialized with the first VF values of
7904 the induction variable. E.g., for an iv with IV_PHI='X' and
7905 evolution S, for a vector of 4 units, we want to compute:
7906 [X, X + S, X + 2*S, X + 3*S]. */
7908 if (dump_enabled_p ())
7909 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7911 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7912 gcc_assert (step_expr != NULL_TREE);
7913 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7915 pe = loop_preheader_edge (iv_loop);
7916 /* Find the first insertion point in the BB. */
7917 basic_block bb = gimple_bb (phi);
7918 si = gsi_after_labels (bb);
7920 /* For SLP induction we have to generate several IVs as for example
7921 with group size 3 we need
7922 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
7923 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
7924 if (slp_node)
7926 /* Enforced above. */
7927 unsigned int const_nunits = nunits.to_constant ();
7929 /* The initial values are vectorized, but any lanes > group_size
7930 need adjustment. */
7931 slp_tree init_node
7932 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
7934 /* Gather steps. Since we do not vectorize inductions as
7935 cycles we have to reconstruct the step from SCEV data. */
7936 unsigned group_size = SLP_TREE_LANES (slp_node);
7937 tree *steps = XALLOCAVEC (tree, group_size);
7938 tree *inits = XALLOCAVEC (tree, group_size);
7939 stmt_vec_info phi_info;
7940 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
7942 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
7943 if (!init_node)
7944 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
7945 pe->dest_idx);
7948 /* Now generate the IVs. */
7949 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7950 gcc_assert ((const_nunits * nvects) % group_size == 0);
7951 unsigned nivs;
7952 if (nested_in_vect_loop)
7953 nivs = nvects;
7954 else
7956 /* Compute the number of distinct IVs we need. First reduce
7957 group_size if it is a multiple of const_nunits so we get
7958 one IV for a group_size of 4 but const_nunits 2. */
7959 unsigned group_sizep = group_size;
7960 if (group_sizep % const_nunits == 0)
7961 group_sizep = group_sizep / const_nunits;
7962 nivs = least_common_multiple (group_sizep,
7963 const_nunits) / const_nunits;
7965 tree stept = TREE_TYPE (step_vectype);
7966 tree lupdate_mul = NULL_TREE;
7967 if (!nested_in_vect_loop)
7969 /* The number of iterations covered in one vector iteration. */
7970 unsigned lup_mul = (nvects * const_nunits) / group_size;
7971 lupdate_mul
7972 = build_vector_from_val (step_vectype,
7973 SCALAR_FLOAT_TYPE_P (stept)
7974 ? build_real_from_wide (stept, lup_mul,
7975 UNSIGNED)
7976 : build_int_cstu (stept, lup_mul));
7978 tree peel_mul = NULL_TREE;
7979 gimple_seq init_stmts = NULL;
7980 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
7982 if (SCALAR_FLOAT_TYPE_P (stept))
7983 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
7984 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
7985 else
7986 peel_mul = gimple_convert (&init_stmts, stept,
7987 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
7988 peel_mul = gimple_build_vector_from_val (&init_stmts,
7989 step_vectype, peel_mul);
7991 unsigned ivn;
7992 auto_vec<tree> vec_steps;
7993 for (ivn = 0; ivn < nivs; ++ivn)
7995 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
7996 tree_vector_builder init_elts (vectype, const_nunits, 1);
7997 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
7998 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8000 /* The scalar steps of the IVs. */
8001 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8002 step_elts.quick_push (elt);
8003 if (!init_node)
8005 /* The scalar inits of the IVs if not vectorized. */
8006 elt = inits[(ivn*const_nunits + eltn) % group_size];
8007 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8008 TREE_TYPE (elt)))
8009 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8010 TREE_TYPE (vectype), elt);
8011 init_elts.quick_push (elt);
8013 /* The number of steps to add to the initial values. */
8014 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8015 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8016 ? build_real_from_wide (stept,
8017 mul_elt, UNSIGNED)
8018 : build_int_cstu (stept, mul_elt));
8020 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8021 vec_step = gimple_convert (&init_stmts, step_vectype, vec_step);
8022 vec_steps.safe_push (vec_step);
8023 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8024 if (peel_mul)
8025 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8026 step_mul, peel_mul);
8027 if (!init_node)
8028 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8030 /* Create the induction-phi that defines the induction-operand. */
8031 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8032 "vec_iv_");
8033 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8034 induc_def = PHI_RESULT (induction_phi);
8036 /* Create the iv update inside the loop */
8037 tree up = vec_step;
8038 if (lupdate_mul)
8039 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8040 vec_step, lupdate_mul);
8041 gimple_seq stmts = NULL;
8042 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8043 vec_def = gimple_build (&stmts,
8044 PLUS_EXPR, step_vectype, vec_def, up);
8045 vec_def = gimple_convert (&stmts, vectype, vec_def);
8046 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8047 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8048 UNKNOWN_LOCATION);
8050 if (init_node)
8051 vec_init = vect_get_slp_vect_def (init_node, ivn);
8052 if (!nested_in_vect_loop
8053 && !integer_zerop (step_mul))
8055 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8056 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8057 vec_step, step_mul);
8058 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8059 vec_def, up);
8060 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8063 /* Set the arguments of the phi node: */
8064 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8066 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8068 if (!nested_in_vect_loop)
8070 /* Fill up to the number of vectors we need for the whole group. */
8071 nivs = least_common_multiple (group_size,
8072 const_nunits) / const_nunits;
8073 for (; ivn < nivs; ++ivn)
8075 SLP_TREE_VEC_STMTS (slp_node)
8076 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8077 vec_steps.safe_push (vec_steps[0]);
8081 /* Re-use IVs when we can. We are generating further vector
8082 stmts by adding VF' * stride to the IVs generated above. */
8083 if (ivn < nvects)
8085 unsigned vfp
8086 = least_common_multiple (group_size, const_nunits) / group_size;
8087 tree lupdate_mul
8088 = build_vector_from_val (step_vectype,
8089 SCALAR_FLOAT_TYPE_P (stept)
8090 ? build_real_from_wide (stept,
8091 vfp, UNSIGNED)
8092 : build_int_cstu (stept, vfp));
8093 for (; ivn < nvects; ++ivn)
8095 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8096 tree def = gimple_get_lhs (iv);
8097 if (ivn < 2*nivs)
8098 vec_steps[ivn - nivs]
8099 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8100 vec_steps[ivn - nivs], lupdate_mul);
8101 gimple_seq stmts = NULL;
8102 def = gimple_convert (&stmts, step_vectype, def);
8103 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8104 def, vec_steps[ivn % nivs]);
8105 def = gimple_convert (&stmts, vectype, def);
8106 if (gimple_code (iv) == GIMPLE_PHI)
8107 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8108 else
8110 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8111 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8113 SLP_TREE_VEC_STMTS (slp_node)
8114 .quick_push (SSA_NAME_DEF_STMT (def));
8118 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8119 gcc_assert (!new_bb);
8121 return true;
8124 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
8125 loop_preheader_edge (iv_loop));
8127 gimple_seq stmts = NULL;
8128 if (!nested_in_vect_loop)
8130 /* Convert the initial value to the IV update type. */
8131 tree new_type = TREE_TYPE (step_expr);
8132 init_expr = gimple_convert (&stmts, new_type, init_expr);
8134 /* If we are using the loop mask to "peel" for alignment then we need
8135 to adjust the start value here. */
8136 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8137 if (skip_niters != NULL_TREE)
8139 if (FLOAT_TYPE_P (vectype))
8140 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8141 skip_niters);
8142 else
8143 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8144 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8145 skip_niters, step_expr);
8146 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8147 init_expr, skip_step);
8151 if (stmts)
8153 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8154 gcc_assert (!new_bb);
8157 /* Create the vector that holds the initial_value of the induction. */
8158 if (nested_in_vect_loop)
8160 /* iv_loop is nested in the loop to be vectorized. init_expr had already
8161 been created during vectorization of previous stmts. We obtain it
8162 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8163 auto_vec<tree> vec_inits;
8164 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8165 init_expr, &vec_inits);
8166 vec_init = vec_inits[0];
8167 /* If the initial value is not of proper type, convert it. */
8168 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8170 new_stmt
8171 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8172 vect_simple_var,
8173 "vec_iv_"),
8174 VIEW_CONVERT_EXPR,
8175 build1 (VIEW_CONVERT_EXPR, vectype,
8176 vec_init));
8177 vec_init = gimple_assign_lhs (new_stmt);
8178 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8179 new_stmt);
8180 gcc_assert (!new_bb);
8183 else
8185 /* iv_loop is the loop to be vectorized. Create:
8186 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8187 stmts = NULL;
8188 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8190 unsigned HOST_WIDE_INT const_nunits;
8191 if (nunits.is_constant (&const_nunits))
8193 tree_vector_builder elts (step_vectype, const_nunits, 1);
8194 elts.quick_push (new_name);
8195 for (i = 1; i < const_nunits; i++)
8197 /* Create: new_name_i = new_name + step_expr */
8198 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8199 new_name, step_expr);
8200 elts.quick_push (new_name);
8202 /* Create a vector from [new_name_0, new_name_1, ...,
8203 new_name_nunits-1] */
8204 vec_init = gimple_build_vector (&stmts, &elts);
8206 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8207 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8208 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8209 new_name, step_expr);
8210 else
8212 /* Build:
8213 [base, base, base, ...]
8214 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8215 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8216 gcc_assert (flag_associative_math);
8217 tree index = build_index_vector (step_vectype, 0, 1);
8218 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8219 new_name);
8220 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8221 step_expr);
8222 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8223 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8224 vec_init, step_vec);
8225 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8226 vec_init, base_vec);
8228 vec_init = gimple_convert (&stmts, vectype, vec_init);
8230 if (stmts)
8232 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8233 gcc_assert (!new_bb);
8238 /* Create the vector that holds the step of the induction. */
8239 if (nested_in_vect_loop)
8240 /* iv_loop is nested in the loop to be vectorized. Generate:
8241 vec_step = [S, S, S, S] */
8242 new_name = step_expr;
8243 else
8245 /* iv_loop is the loop to be vectorized. Generate:
8246 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8247 gimple_seq seq = NULL;
8248 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8250 expr = build_int_cst (integer_type_node, vf);
8251 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8253 else
8254 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8255 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8256 expr, step_expr);
8257 if (seq)
8259 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8260 gcc_assert (!new_bb);
8264 t = unshare_expr (new_name);
8265 gcc_assert (CONSTANT_CLASS_P (new_name)
8266 || TREE_CODE (new_name) == SSA_NAME);
8267 new_vec = build_vector_from_val (step_vectype, t);
8268 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8269 new_vec, step_vectype, NULL);
8272 /* Create the following def-use cycle:
8273 loop prolog:
8274 vec_init = ...
8275 vec_step = ...
8276 loop:
8277 vec_iv = PHI <vec_init, vec_loop>
8279 STMT
8281 vec_loop = vec_iv + vec_step; */
8283 /* Create the induction-phi that defines the induction-operand. */
8284 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8285 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8286 induc_def = PHI_RESULT (induction_phi);
8288 /* Create the iv update inside the loop */
8289 stmts = NULL;
8290 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8291 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8292 vec_def = gimple_convert (&stmts, vectype, vec_def);
8293 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8294 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8296 /* Set the arguments of the phi node: */
8297 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8298 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8299 UNKNOWN_LOCATION);
8301 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8302 *vec_stmt = induction_phi;
8304 /* In case the vectorization factor (VF) is bigger than the number
8305 of elements that we can fit in a vectype (nunits), we have to generate
8306 more than one vector stmt, i.e. we need to "unroll" the
8307 vector stmt by a factor VF/nunits. For more details see documentation
8308 in vectorizable_operation. */
8310 if (ncopies > 1)
8312 gimple_seq seq = NULL;
8313 /* FORNOW. This restriction should be relaxed. */
8314 gcc_assert (!nested_in_vect_loop);
8316 /* Create the vector that holds the step of the induction. */
8317 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8319 expr = build_int_cst (integer_type_node, nunits);
8320 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8322 else
8323 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8324 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8325 expr, step_expr);
8326 if (seq)
8328 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8329 gcc_assert (!new_bb);
8332 t = unshare_expr (new_name);
8333 gcc_assert (CONSTANT_CLASS_P (new_name)
8334 || TREE_CODE (new_name) == SSA_NAME);
8335 new_vec = build_vector_from_val (step_vectype, t);
8336 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8337 new_vec, step_vectype, NULL);
8339 vec_def = induc_def;
8340 for (i = 1; i < ncopies; i++)
8342 /* vec_i = vec_prev + vec_step */
8343 gimple_seq stmts = NULL;
8344 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8345 vec_def = gimple_build (&stmts,
8346 PLUS_EXPR, step_vectype, vec_def, vec_step);
8347 vec_def = gimple_convert (&stmts, vectype, vec_def);
8349 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8350 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8351 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8355 if (dump_enabled_p ())
8356 dump_printf_loc (MSG_NOTE, vect_location,
8357 "transform induction: created def-use cycle: %G%G",
8358 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8360 return true;
8363 /* Function vectorizable_live_operation.
8365 STMT_INFO computes a value that is used outside the loop. Check if
8366 it can be supported. */
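/* (Illustrative) the typical case is a scalar streamed out of the loop:

     for (i = 0; i < n; i++)
       last = a[i];
     use (last);

   After vectorization the value of LAST has to be extracted from the
   right lane of the final vector.  */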
8368 bool
8369 vectorizable_live_operation (vec_info *vinfo,
8370 stmt_vec_info stmt_info,
8371 gimple_stmt_iterator *gsi,
8372 slp_tree slp_node, slp_instance slp_node_instance,
8373 int slp_index, bool vec_stmt_p,
8374 stmt_vector_for_cost *cost_vec)
8376 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8377 imm_use_iterator imm_iter;
8378 tree lhs, lhs_type, bitsize, vec_bitsize;
8379 tree vectype = (slp_node
8380 ? SLP_TREE_VECTYPE (slp_node)
8381 : STMT_VINFO_VECTYPE (stmt_info));
8382 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8383 int ncopies;
8384 gimple *use_stmt;
8385 auto_vec<tree> vec_oprnds;
8386 int vec_entry = 0;
8387 poly_uint64 vec_index = 0;
8389 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8391 /* If a stmt of a reduction is live, vectorize it via
8392 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8393 validity so just trigger the transform here. */
8394 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8396 if (!vec_stmt_p)
8397 return true;
8398 if (slp_node)
8400 /* For reduction chains the meta-info is attached to
8401 the group leader. */
8402 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8403 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8404 /* For SLP reductions we vectorize the epilogue for
8405 all involved stmts together. */
8406 else if (slp_index != 0)
8407 return true;
8408 else
8409 /* For SLP reductions the meta-info is attached to
8410 the representative. */
8411 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8413 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8414 gcc_assert (reduc_info->is_reduc_info);
8415 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8416 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8417 return true;
8418 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8419 slp_node_instance);
8420 return true;
8423 /* If STMT is not relevant and it is a simple assignment and its inputs are
8424 invariant then it can remain in place, unvectorized. The original last
8425 scalar value that it computes will be used. */
8426 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8428 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8429 if (dump_enabled_p ())
8430 dump_printf_loc (MSG_NOTE, vect_location,
8431 "statement is simple and uses invariant. Leaving in "
8432 "place.\n");
8433 return true;
8436 if (slp_node)
8437 ncopies = 1;
8438 else
8439 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8441 if (slp_node)
8443 gcc_assert (slp_index >= 0);
8445 /* Get the position of the last occurrence of the scalar within the
8446 concatenation of all the SLP vectors. Calculate which SLP vector it is
8447 in and the lane index within it. */
8448 int num_scalar = SLP_TREE_LANES (slp_node);
8449 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8450 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8452 /* Calculate which vector contains the result, and which lane of
8453 that vector we need. */
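/* For example (purely illustrative numbers): with num_vec == 1, nunits == 4,
   num_scalar == 2 and slp_index == 0, the concatenated lanes look like
   { a0, b0, a1, b1 } and pos is 1 * 4 - 2 + 0 == 2, so the division below
   yields vec_entry == 0 and vec_index == 2: the last 'a' lives in lane 2 of
   the first vector statement.  */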
8454 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8456 if (dump_enabled_p ())
8457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8458 "Cannot determine which vector holds the"
8459 " final result.\n");
8460 return false;
8464 if (!vec_stmt_p)
8466 /* No transformation required. */
8467 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8469 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8470 OPTIMIZE_FOR_SPEED))
8472 if (dump_enabled_p ())
8473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8474 "can't operate on partial vectors "
8475 "because the target doesn't support extract "
8476 "last reduction.\n");
8477 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8479 else if (slp_node)
8481 if (dump_enabled_p ())
8482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8483 "can't operate on partial vectors "
8484 "because an SLP statement is live after "
8485 "the loop.\n");
8486 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8488 else if (ncopies > 1)
8490 if (dump_enabled_p ())
8491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8492 "can't operate on partial vectors "
8493 "because ncopies is greater than 1.\n");
8494 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8496 else
8498 gcc_assert (ncopies == 1 && !slp_node);
8499 vect_record_loop_mask (loop_vinfo,
8500 &LOOP_VINFO_MASKS (loop_vinfo),
8501 1, vectype, NULL);
8504 /* ??? Enable for loop costing as well. */
8505 if (!loop_vinfo)
8506 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8507 0, vect_epilogue);
8508 return true;
8511 /* Use the lhs of the original scalar statement. */
8512 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8513 if (dump_enabled_p ())
8514 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8515 "stmt %G", stmt);
8517 lhs = gimple_get_lhs (stmt);
8518 lhs_type = TREE_TYPE (lhs);
8520 bitsize = vector_element_bits_tree (vectype);
8521 vec_bitsize = TYPE_SIZE (vectype);
8523 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8524 tree vec_lhs, bitstart;
8525 gimple *vec_stmt;
8526 if (slp_node)
8528 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8530 /* Get the correct slp vectorized stmt. */
8531 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8532 vec_lhs = gimple_get_lhs (vec_stmt);
8534 /* Get entry to use. */
8535 bitstart = bitsize_int (vec_index);
8536 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8538 else
8540 /* For multiple copies, get the last copy. */
8541 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8542 vec_lhs = gimple_get_lhs (vec_stmt);
8544 /* Get the last lane in the vector. */
8545 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
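/* For instance, for a V4SI vector VEC_BITSIZE is 128 and BITSIZE is 32, so
   BITSTART becomes 96, the bit offset of the last lane.  */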
8548 if (loop_vinfo)
8550 /* Ensure that the VEC_LHS used by the lane-extraction stmts satisfies the
8551 loop-closed PHI requirement by inserting one PHI node for it. That is:
8552 loop;
8553 BB:
8554 # lhs' = PHI <lhs>
8555 ==>
8556 loop;
8557 BB:
8558 # vec_lhs' = PHI <vec_lhs>
8559 new_tree = lane_extract <vec_lhs', ...>;
8560 lhs' = new_tree; */
8562 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8563 basic_block exit_bb = single_exit (loop)->dest;
8564 gcc_assert (single_pred_p (exit_bb));
8566 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8567 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8568 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8570 gimple_seq stmts = NULL;
8571 tree new_tree;
8572 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8574 /* Emit:
8576 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8578 where VEC_LHS is the vectorized live-out result and MASK is
8579 the loop mask for the final iteration. */
8580 gcc_assert (ncopies == 1 && !slp_node);
8581 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8582 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8583 1, vectype, 0);
8584 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8585 mask, vec_lhs_phi);
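/* As an illustrative example: with VEC_LHS == { a, b, c, d } and a final
   loop mask of { 1, 1, 1, 0 }, the EXTRACT_LAST call above yields c, the
   last element whose mask bit is set.  */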
8587 /* Convert the extracted vector element to the scalar type. */
8588 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8590 else
8592 tree bftype = TREE_TYPE (vectype);
8593 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8594 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8595 new_tree = build3 (BIT_FIELD_REF, bftype,
8596 vec_lhs_phi, bitsize, bitstart);
8597 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8598 &stmts, true, NULL_TREE);
8601 if (stmts)
8603 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8604 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8606 /* Remove the existing LCSSA PHI for lhs and assign new_tree to its result instead. */
8607 tree lhs_phi = NULL_TREE;
8608 gimple_stmt_iterator gsi;
8609 for (gsi = gsi_start_phis (exit_bb);
8610 !gsi_end_p (gsi); gsi_next (&gsi))
8612 gimple *phi = gsi_stmt (gsi);
8613 if ((gimple_phi_arg_def (phi, 0) == lhs))
8615 remove_phi_node (&gsi, false);
8616 lhs_phi = gimple_phi_result (phi);
8617 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8618 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8619 break;
8624 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8625 single-argument PHI, just replace all uses of the PHI result; the LCSSA
8626 PHI defining lhs may precede the newly inserted stmt. */
8627 use_operand_p use_p;
8628 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8629 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8630 && !is_gimple_debug (use_stmt))
8632 if (gimple_code (use_stmt) == GIMPLE_PHI
8633 && gimple_phi_num_args (use_stmt) == 1)
8635 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8637 else
8639 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8640 SET_USE (use_p, new_tree);
8642 update_stmt (use_stmt);
8645 else
8647 /* For basic-block vectorization simply insert the lane-extraction. */
8648 tree bftype = TREE_TYPE (vectype);
8649 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8650 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8651 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8652 vec_lhs, bitsize, bitstart);
8653 gimple_seq stmts = NULL;
8654 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8655 &stmts, true, NULL_TREE);
8656 if (TREE_CODE (new_tree) == SSA_NAME
8657 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8658 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8659 if (is_a <gphi *> (vec_stmt))
8661 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8662 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8664 else
8666 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8667 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8670 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8671 single-argument PHI, just replace all uses of the PHI result; the LCSSA
8672 PHI defining lhs may precede the newly inserted stmt. */
8673 use_operand_p use_p;
8674 stmt_vec_info use_stmt_info;
8675 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8676 if (!is_gimple_debug (use_stmt)
8677 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8678 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8680 /* ??? This can happen when the live lane ends up being
8681 used in a vector construction that is code-generated by an
8682 external SLP node (and code generation for that has already
8683 happened). See gcc.dg/vect/bb-slp-47.c.
8684 Doing this is what would happen if that vector CTOR
8685 were not code-generated yet so it is not too bad.
8686 ??? In fact we'd likely want to avoid this situation
8687 in the first place. */
8688 if (TREE_CODE (new_tree) == SSA_NAME
8689 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8690 && gimple_code (use_stmt) != GIMPLE_PHI
8691 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8692 use_stmt))
8694 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8695 gcc_assert (code == CONSTRUCTOR
8696 || code == VIEW_CONVERT_EXPR
8697 || CONVERT_EXPR_CODE_P (code));
8698 if (dump_enabled_p ())
8699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8700 "Using original scalar computation for "
8701 "live lane because use preceeds vector "
8702 "def\n");
8703 continue;
8705 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8706 SET_USE (use_p, new_tree);
8707 update_stmt (use_stmt);
8711 return true;
8714 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8716 static void
8717 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8719 ssa_op_iter op_iter;
8720 imm_use_iterator imm_iter;
8721 def_operand_p def_p;
8722 gimple *ustmt;
8724 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8726 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8728 basic_block bb;
8730 if (!is_gimple_debug (ustmt))
8731 continue;
8733 bb = gimple_bb (ustmt);
8735 if (!flow_bb_inside_loop_p (loop, bb))
8737 if (gimple_debug_bind_p (ustmt))
8739 if (dump_enabled_p ())
8740 dump_printf_loc (MSG_NOTE, vect_location,
8741 "killing debug use\n");
8743 gimple_debug_bind_reset_value (ustmt);
8744 update_stmt (ustmt);
8746 else
8747 gcc_unreachable ();
8753 /* Given a loop represented by LOOP_VINFO, return true if computation of
8754 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8755 otherwise. */
8757 static bool
8758 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8760 /* Constant case. */
8761 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8763 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8764 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8766 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8767 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8768 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8769 return true;
8772 widest_int max;
8773 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8774 /* Check the upper bound of loop niters. */
8775 if (get_max_loop_iterations (loop, &max))
8777 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8778 signop sgn = TYPE_SIGN (type);
8779 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8780 if (max < type_max)
8781 return true;
8783 return false;
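/* For instance, if the IV type is a 32-bit unsigned type and NITERSM1 is
   0xffffffff, then NITERSM1 + 1 wraps around to 0 and the function returns
   false; for any smaller constant NITERSM1 the nitersm1 < niters check
   succeeds and it returns true.  */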
8786 /* Return a mask type with half the number of elements as OLD_TYPE,
8787 given that it should have mode NEW_MODE. */
8789 tree
8790 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8792 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8793 return build_truth_vector_type_for_mode (nunits, new_mode);
8796 /* Return a mask type with twice as many elements as OLD_TYPE,
8797 given that it should have mode NEW_MODE. */
8799 tree
8800 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8802 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8803 return build_truth_vector_type_for_mode (nunits, new_mode);
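/* For example, if OLD_TYPE has 8 boolean elements, vect_halve_mask_nunits
   builds a 4-element mask type in NEW_MODE, while vect_double_mask_nunits
   builds a 16-element one.  These are typically needed when masks have to be
   packed or unpacked to match a different vector element count.  */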
8806 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8807 contain a sequence of NVECTORS masks that each control a vector of type
8808 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8809 these vector masks with the vector version of SCALAR_MASK. */
8811 void
8812 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8813 unsigned int nvectors, tree vectype, tree scalar_mask)
8815 gcc_assert (nvectors != 0);
8816 if (masks->length () < nvectors)
8817 masks->safe_grow_cleared (nvectors, true);
8818 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8819 /* The number of scalars per iteration and the number of vectors are
8820 both compile-time constants. */
8821 unsigned int nscalars_per_iter
8822 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8823 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8825 if (scalar_mask)
8827 scalar_cond_masked_key cond (scalar_mask, nvectors);
8828 loop_vinfo->scalar_cond_masked_set.add (cond);
8831 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8833 rgm->max_nscalars_per_iter = nscalars_per_iter;
8834 rgm->type = truth_type_for (vectype);
8835 rgm->factor = 1;
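/* Worked example (types chosen only for illustration): with a vectorization
   factor of 8, recording nvectors == 2 masks for a V8HI vector type gives
   nscalars_per_iter == 2 * 8 / 8 == 2, so (*masks)[1] describes an rgroup
   whose mask type is truth_type_for (V8HI) and whose max_nscalars_per_iter
   is at least 2; the actual mask SSA names are only created later, in
   vect_get_loop_mask.  */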
8839 /* Given a complete set of masks MASKS, extract mask number INDEX
8840 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8841 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8843 See the comment above vec_loop_masks for more details about the mask
8844 arrangement. */
8846 tree
8847 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8848 unsigned int nvectors, tree vectype, unsigned int index)
8850 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8851 tree mask_type = rgm->type;
8853 /* Populate the rgroup's mask array, if this is the first time we've
8854 used it. */
8855 if (rgm->controls.is_empty ())
8857 rgm->controls.safe_grow_cleared (nvectors, true);
8858 for (unsigned int i = 0; i < nvectors; ++i)
8860 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8861 /* Provide a dummy definition until the real one is available. */
8862 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8863 rgm->controls[i] = mask;
8867 tree mask = rgm->controls[index];
8868 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8869 TYPE_VECTOR_SUBPARTS (vectype)))
8871 /* A loop mask for data type X can be reused for data type Y
8872 if X has N times more elements than Y and if Y's elements
8873 are N times bigger than X's. In this case each sequence
8874 of N elements in the loop mask will be all-zero or all-one.
8875 We can then view-convert the mask so that each sequence of
8876 N elements is replaced by a single element. */
8877 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8878 TYPE_VECTOR_SUBPARTS (vectype)));
8879 gimple_seq seq = NULL;
8880 mask_type = truth_type_for (vectype);
8881 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8882 if (seq)
8883 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8885 return mask;
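/* To illustrate the reuse case above with made-up element counts: a mask
   created for 16 QI elements can also control 8 HI elements, because each
   group of two adjacent mask elements is known to be all-zero or all-one;
   the VIEW_CONVERT_EXPR simply reinterprets those bits as an 8-element
   mask.  */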
8888 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8889 lengths for controlling an operation on VECTYPE. The operation splits
8890 each element of VECTYPE into FACTOR separate subelements, measuring the
8891 length as a number of these subelements. */
8893 void
8894 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8895 unsigned int nvectors, tree vectype, unsigned int factor)
8897 gcc_assert (nvectors != 0);
8898 if (lens->length () < nvectors)
8899 lens->safe_grow_cleared (nvectors, true);
8900 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8902 /* The number of scalars per iteration, the bytes occupied per scalar and
8903 the number of vectors are all compile-time constants. */
8904 unsigned int nscalars_per_iter
8905 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8906 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8908 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8910 /* For now, we only support cases in which all loads and stores fall back
8911 to VnQI or none do. */
8912 gcc_assert (!rgl->max_nscalars_per_iter
8913 || (rgl->factor == 1 && factor == 1)
8914 || (rgl->max_nscalars_per_iter * rgl->factor
8915 == nscalars_per_iter * factor));
8916 rgl->max_nscalars_per_iter = nscalars_per_iter;
8917 rgl->type = vectype;
8918 rgl->factor = factor;
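/* For example (the types are only an illustration), if a V4SI access has to
   fall back to operating on the vector as 16 QI subelements, it is recorded
   with FACTOR == 4 and the length for that rgroup is then measured in bytes
   rather than in SImode elements.  */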
8922 /* Given a complete set of length LENS, extract length number INDEX for an
8923 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
8925 tree
8926 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8927 unsigned int nvectors, unsigned int index)
8929 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8931 /* Populate the rgroup's len array, if this is the first time we've
8932 used it. */
8933 if (rgl->controls.is_empty ())
8935 rgl->controls.safe_grow_cleared (nvectors, true);
8936 for (unsigned int i = 0; i < nvectors; ++i)
8938 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8939 gcc_assert (len_type != NULL_TREE);
8940 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
8942 /* Provide a dummy definition until the real one is available. */
8943 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
8944 rgl->controls[i] = len;
8948 return rgl->controls[index];
8951 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
8952 to match the estimated iteration count of the vectorized loop. */
8954 static void
8955 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8957 edge preheader = loop_preheader_edge (loop);
8958 /* Reduce loop iterations by the vectorization factor. */
8959 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8960 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8962 if (freq_h.nonzero_p ())
8964 profile_probability p;
8966 /* Avoid dropping loop body profile counter to 0 because of zero count
8967 in loop's preheader. */
8968 if (!(freq_e == profile_count::zero ()))
8969 freq_e = freq_e.force_nonzero ();
8970 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8971 scale_loop_frequencies (loop, p);
8974 edge exit_e = single_exit (loop);
8975 exit_e->probability = profile_probability::always ()
8976 .apply_scale (1, new_est_niter + 1);
8978 edge exit_l = single_pred_edge (loop->latch);
8979 profile_probability prob = exit_l->probability;
8980 exit_l->probability = exit_e->probability.invert ();
8981 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8982 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
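/* Illustrative example: if NEW_EST_NITER above comes out as 24 (roughly a
   scalar iteration estimate of 99 divided by VF == 4), the exit edge gets
   probability 1/25 and the body frequencies are scaled so that the header
   count corresponds to 25 times the preheader count.  */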
8985 /* For a vectorized stmt DEF_STMT_INFO, adjust the latch edge values of all
8986 vectorized PHIs whose scalar latch value is originally defined by it. */
8988 static void
8989 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
8990 stmt_vec_info def_stmt_info)
8992 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
8993 if (!def || TREE_CODE (def) != SSA_NAME)
8994 return;
8995 stmt_vec_info phi_info;
8996 imm_use_iterator iter;
8997 use_operand_p use_p;
8998 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
8999 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9000 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9001 && (phi_info = loop_vinfo->lookup_stmt (phi))
9002 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9003 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9004 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9006 loop_p loop = gimple_bb (phi)->loop_father;
9007 edge e = loop_latch_edge (loop);
9008 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9010 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9011 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9012 gcc_assert (phi_defs.length () == latch_defs.length ());
9013 for (unsigned i = 0; i < phi_defs.length (); ++i)
9014 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9015 gimple_get_lhs (latch_defs[i]), e,
9016 gimple_phi_arg_location (phi, e->dest_idx));
9021 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9022 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9023 stmt_vec_info. */
9025 static void
9026 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9027 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9029 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9030 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9032 if (dump_enabled_p ())
9033 dump_printf_loc (MSG_NOTE, vect_location,
9034 "------>vectorizing statement: %G", stmt_info->stmt);
9036 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9037 vect_loop_kill_debug_uses (loop, stmt_info);
9039 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9040 && !STMT_VINFO_LIVE_P (stmt_info))
9041 return;
9043 if (STMT_VINFO_VECTYPE (stmt_info))
9045 poly_uint64 nunits
9046 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9047 if (!STMT_SLP_TYPE (stmt_info)
9048 && maybe_ne (nunits, vf)
9049 && dump_enabled_p ())
9050 /* For SLP, VF is set according to the unrolling factor and not to
9051 the vector size, hence for SLP this message is not valid. */
9052 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9055 /* Pure SLP statements have already been vectorized. We still need
9056 to apply loop vectorization to hybrid SLP statements. */
9057 if (PURE_SLP_STMT (stmt_info))
9058 return;
9060 if (dump_enabled_p ())
9061 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9063 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9064 *seen_store = stmt_info;
9067 /* Helper function to pass to simplify_replace_tree so that trees found
9068 in the hash_map are replaced with their corresponding values. */
9070 static tree
9071 find_in_mapping (tree t, void *context)
9073 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9075 tree *value = mapping->get (t);
9076 return value ? *value : t;
9079 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9080 original loop that has now been vectorized.
9082 The inits of the data_references need to be advanced by the number of
9083 iterations of the main loop. This has been computed in vect_do_peeling and
9084 is stored in the parameter ADVANCE. We first restore the data_references'
9085 initial offsets with the values recorded in ORIG_DRS_INIT.
9087 Since the loop_vec_info of this EPILOGUE was constructed for the original
9088 loop, its stmt_vec_infos all point to the original statements. These need
9089 to be updated to point to their corresponding copies as well as the SSA_NAMES
9090 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9092 The data_references' connections also need to be updated. Their
9093 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
9094 stmt_vec_infos, their statements need to point to their corresponding copy,
9095 if they are gather loads or scatter stores then their reference needs to be
9096 updated to point to its corresponding copy and finally we set
9097 'base_misaligned' to false as we have already peeled for alignment in the
9098 prologue of the main loop. */
9100 static void
9101 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9103 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9104 auto_vec<gimple *> stmt_worklist;
9105 hash_map<tree,tree> mapping;
9106 gimple *orig_stmt, *new_stmt;
9107 gimple_stmt_iterator epilogue_gsi;
9108 gphi_iterator epilogue_phi_gsi;
9109 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9110 basic_block *epilogue_bbs = get_loop_body (epilogue);
9111 unsigned i;
9113 free (LOOP_VINFO_BBS (epilogue_vinfo));
9114 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9116 /* Advance data_reference's with the number of iterations of the previous
9117 loop and its prologue. */
9118 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9121 /* The EPILOGUE loop is a copy of the original loop, so they share the same
9122 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9123 point to the copied statements. We also create a mapping from each LHS in
9124 the original loop to the corresponding LHS in the EPILOGUE, and worklists to
9125 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9126 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9128 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9129 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9131 new_stmt = epilogue_phi_gsi.phi ();
9133 gcc_assert (gimple_uid (new_stmt) > 0);
9134 stmt_vinfo
9135 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9137 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9138 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9140 mapping.put (gimple_phi_result (orig_stmt),
9141 gimple_phi_result (new_stmt));
9142 /* PHI nodes can not have patterns or related statements. */
9143 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9144 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9147 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9148 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9150 new_stmt = gsi_stmt (epilogue_gsi);
9151 if (is_gimple_debug (new_stmt))
9152 continue;
9154 gcc_assert (gimple_uid (new_stmt) > 0);
9155 stmt_vinfo
9156 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9158 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9159 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9161 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9162 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9164 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9166 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9167 for (gimple_stmt_iterator gsi = gsi_start (seq);
9168 !gsi_end_p (gsi); gsi_next (&gsi))
9169 stmt_worklist.safe_push (gsi_stmt (gsi));
9172 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9173 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9175 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9176 stmt_worklist.safe_push (stmt);
9177 /* Set BB such that the assert in
9178 'get_initial_def_for_reduction' is able to determine that
9179 the BB of the related stmt is inside this loop. */
9180 gimple_set_bb (stmt,
9181 gimple_bb (new_stmt));
9182 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9183 gcc_assert (related_vinfo == NULL
9184 || related_vinfo == stmt_vinfo);
9189 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9190 using the original main loop and thus need to be updated to refer to the
9191 cloned variables used in the epilogue. */
9192 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9194 gimple *stmt = stmt_worklist[i];
9195 tree *new_op;
9197 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9199 tree op = gimple_op (stmt, j);
9200 if ((new_op = mapping.get(op)))
9201 gimple_set_op (stmt, j, *new_op);
9202 else
9204 /* PR92429: The last argument of simplify_replace_tree disables
9205 folding when replacing arguments. This is required as
9206 otherwise you might end up with different statements than the
9207 ones analyzed in vect_loop_analyze, leading to different
9208 vectorization. */
9209 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9210 &find_in_mapping, &mapping, false);
9211 gimple_set_op (stmt, j, op);
9216 struct data_reference *dr;
9217 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9218 FOR_EACH_VEC_ELT (datarefs, i, dr)
9220 orig_stmt = DR_STMT (dr);
9221 gcc_assert (gimple_uid (orig_stmt) > 0);
9222 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9223 /* Data references for gather loads and scatter stores do not use the
9224 updated offset we set using ADVANCE. Instead we have to make sure the
9225 references in the data references point to the corresponding copies of
9226 the originals in the epilogue. */
9227 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9228 == VMAT_GATHER_SCATTER)
9230 DR_REF (dr)
9231 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9232 &find_in_mapping, &mapping);
9233 DR_BASE_ADDRESS (dr)
9234 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9235 &find_in_mapping, &mapping);
9237 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9238 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9239 /* The vector size of the epilogue is smaller than that of the main loop,
9240 so the alignment is either the same or lower. This means the DR will
9241 by definition be aligned. */
9242 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9245 epilogue_vinfo->shared->datarefs_copy.release ();
9246 epilogue_vinfo->shared->save_datarefs ();
9249 /* Function vect_transform_loop.
9251 The analysis phase has determined that the loop is vectorizable.
9252 Vectorize the loop - create vectorized stmts to replace the scalar
9253 stmts in the loop, and update the loop exit condition.
9254 Returns the scalar epilogue loop, if any. */
9256 class loop *
9257 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9259 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9260 class loop *epilogue = NULL;
9261 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9262 int nbbs = loop->num_nodes;
9263 int i;
9264 tree niters_vector = NULL_TREE;
9265 tree step_vector = NULL_TREE;
9266 tree niters_vector_mult_vf = NULL_TREE;
9267 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9268 unsigned int lowest_vf = constant_lower_bound (vf);
9269 gimple *stmt;
9270 bool check_profitability = false;
9271 unsigned int th;
9273 DUMP_VECT_SCOPE ("vec_transform_loop");
9275 loop_vinfo->shared->check_datarefs ();
9277 /* Use the more conservative vectorization threshold. If the number
9278 of iterations is constant, assume the cost check has been performed
9279 by our caller. If the threshold makes all loops that run at least the
9280 (estimated) vectorization factor number of times profitable, then
9281 checking is pointless, too. */
9282 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9283 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9285 if (dump_enabled_p ())
9286 dump_printf_loc (MSG_NOTE, vect_location,
9287 "Profitability threshold is %d loop iterations.\n",
9288 th);
9289 check_profitability = true;
9292 /* Make sure there exists a single-predecessor exit bb. Do this before
9293 versioning. */
9294 edge e = single_exit (loop);
9295 if (! single_pred_p (e->dest))
9297 split_loop_exit_edge (e, true);
9298 if (dump_enabled_p ())
9299 dump_printf (MSG_NOTE, "split exit edge\n");
9302 /* Version the loop first, if required, so the profitability check
9303 comes first. */
9305 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9307 class loop *sloop
9308 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9309 sloop->force_vectorize = false;
9310 check_profitability = false;
9313 /* Make sure there exists a single-predecessor exit bb also on the
9314 scalar loop copy. Do this after versioning but before peeling
9315 so the CFG structure is fine for both the scalar and the if-converted
9316 loop, and so slpeel_duplicate_current_defs_from_edges sees matched
9317 loop-closed PHI nodes on the exit. */
9318 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9320 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9321 if (! single_pred_p (e->dest))
9323 split_loop_exit_edge (e, true);
9324 if (dump_enabled_p ())
9325 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9329 tree niters = vect_build_loop_niters (loop_vinfo);
9330 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9331 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9332 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9333 tree advance;
9334 drs_init_vec orig_drs_init;
9336 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9337 &step_vector, &niters_vector_mult_vf, th,
9338 check_profitability, niters_no_overflow,
9339 &advance);
9341 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9342 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9343 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9344 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9346 if (niters_vector == NULL_TREE)
9348 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9349 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9350 && known_eq (lowest_vf, vf))
9352 niters_vector
9353 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9354 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9355 step_vector = build_one_cst (TREE_TYPE (niters));
9357 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9358 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9359 &step_vector, niters_no_overflow);
9360 else
9361 /* vect_do_peeling subtracted the number of peeled prologue
9362 iterations from LOOP_VINFO_NITERS. */
9363 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9364 &niters_vector, &step_vector,
9365 niters_no_overflow);
9368 /* 1) Make sure the loop header has exactly two entries
9369 2) Make sure we have a preheader basic block. */
9371 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9373 split_edge (loop_preheader_edge (loop));
9375 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9376 /* This will deal with any possible peeling. */
9377 vect_prepare_for_masked_peels (loop_vinfo);
9379 /* Schedule the SLP instances first, then handle loop vectorization
9380 below. */
9381 if (!loop_vinfo->slp_instances.is_empty ())
9383 DUMP_VECT_SCOPE ("scheduling SLP instances");
9384 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9387 /* FORNOW: the vectorizer supports only loops whose body consists
9388 of one basic block (header + empty latch). When the vectorizer
9389 supports more involved loop forms, the order in which the BBs are
9390 traversed will need to be reconsidered. */
9392 for (i = 0; i < nbbs; i++)
9394 basic_block bb = bbs[i];
9395 stmt_vec_info stmt_info;
9397 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9398 gsi_next (&si))
9400 gphi *phi = si.phi ();
9401 if (dump_enabled_p ())
9402 dump_printf_loc (MSG_NOTE, vect_location,
9403 "------>vectorizing phi: %G", phi);
9404 stmt_info = loop_vinfo->lookup_stmt (phi);
9405 if (!stmt_info)
9406 continue;
9408 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9409 vect_loop_kill_debug_uses (loop, stmt_info);
9411 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9412 && !STMT_VINFO_LIVE_P (stmt_info))
9413 continue;
9415 if (STMT_VINFO_VECTYPE (stmt_info)
9416 && (maybe_ne
9417 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9418 && dump_enabled_p ())
9419 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9421 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9422 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9423 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9424 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9425 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9426 && ! PURE_SLP_STMT (stmt_info))
9428 if (dump_enabled_p ())
9429 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9430 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9434 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9435 gsi_next (&si))
9437 gphi *phi = si.phi ();
9438 stmt_info = loop_vinfo->lookup_stmt (phi);
9439 if (!stmt_info)
9440 continue;
9442 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9443 && !STMT_VINFO_LIVE_P (stmt_info))
9444 continue;
9446 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9447 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9448 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9449 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9450 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9451 && ! PURE_SLP_STMT (stmt_info))
9452 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9455 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9456 !gsi_end_p (si);)
9458 stmt = gsi_stmt (si);
9459 /* During vectorization remove existing clobber stmts. */
9460 if (gimple_clobber_p (stmt))
9462 unlink_stmt_vdef (stmt);
9463 gsi_remove (&si, true);
9464 release_defs (stmt);
9466 else
9468 /* Ignore vector stmts created in the outer loop. */
9469 stmt_info = loop_vinfo->lookup_stmt (stmt);
9471 /* vector stmts created in the outer-loop during vectorization of
9472 stmts in an inner-loop may not have a stmt_info, and do not
9473 need to be vectorized. */
9474 stmt_vec_info seen_store = NULL;
9475 if (stmt_info)
9477 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9479 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9480 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9481 !gsi_end_p (subsi); gsi_next (&subsi))
9483 stmt_vec_info pat_stmt_info
9484 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9485 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9486 &si, &seen_store);
9488 stmt_vec_info pat_stmt_info
9489 = STMT_VINFO_RELATED_STMT (stmt_info);
9490 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
9491 &seen_store);
9492 maybe_set_vectorized_backedge_value (loop_vinfo,
9493 pat_stmt_info);
9495 else
9497 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9498 &seen_store);
9499 maybe_set_vectorized_backedge_value (loop_vinfo,
9500 stmt_info);
9503 gsi_next (&si);
9504 if (seen_store)
9506 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9507 /* Interleaving. The vectorization of the
9508 interleaving chain was completed - free
9509 all the stores in the chain. */
9510 vect_remove_stores (loop_vinfo,
9511 DR_GROUP_FIRST_ELEMENT (seen_store));
9512 else
9513 /* Free the attached stmt_vec_info and remove the stmt. */
9514 loop_vinfo->remove_stmt (stmt_info);
9519 /* Stub out scalar statements that must not survive vectorization.
9520 Doing this here helps with grouped statements, or statements that
9521 are involved in patterns. */
9522 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9523 !gsi_end_p (gsi); gsi_next (&gsi))
9525 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9526 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9528 tree lhs = gimple_get_lhs (call);
9529 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9531 tree zero = build_zero_cst (TREE_TYPE (lhs));
9532 gimple *new_stmt = gimple_build_assign (lhs, zero);
9533 gsi_replace (&gsi, new_stmt, true);
9537 } /* BBs in loop */
9539 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9540 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9541 if (integer_onep (step_vector))
9542 niters_no_overflow = true;
9543 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9544 niters_vector_mult_vf, !niters_no_overflow);
9546 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9547 scale_profile_for_vect_loop (loop, assumed_vf);
9549 /* True if the final iteration might not handle a full vector's
9550 worth of scalar iterations. */
9551 bool final_iter_may_be_partial
9552 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9553 /* The minimum number of iterations performed by the epilogue. This
9554 is 1 when peeling for gaps because we always need a final scalar
9555 iteration. */
9556 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9557 /* +1 to convert latch counts to loop iteration counts,
9558 -min_epilogue_iters to remove iterations that cannot be performed
9559 by the vector code. */
9560 int bias_for_lowest = 1 - min_epilogue_iters;
9561 int bias_for_assumed = bias_for_lowest;
9562 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9563 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9565 /* When the amount of peeling is known at compile time, the first
9566 iteration will have exactly alignment_npeels active elements.
9567 In the worst case it will have at least one. */
9568 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9569 bias_for_lowest += lowest_vf - min_first_active;
9570 bias_for_assumed += assumed_vf - min_first_active;
9572 /* In these calculations the "- 1" converts loop iteration counts
9573 back to latch counts. */
9574 if (loop->any_upper_bound)
9575 loop->nb_iterations_upper_bound
9576 = (final_iter_may_be_partial
9577 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9578 lowest_vf) - 1
9579 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9580 lowest_vf) - 1);
9581 if (loop->any_likely_upper_bound)
9582 loop->nb_iterations_likely_upper_bound
9583 = (final_iter_may_be_partial
9584 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9585 + bias_for_lowest, lowest_vf) - 1
9586 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9587 + bias_for_lowest, lowest_vf) - 1);
9588 if (loop->any_estimate)
9589 loop->nb_iterations_estimate
9590 = (final_iter_may_be_partial
9591 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9592 assumed_vf) - 1
9593 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9594 assumed_vf) - 1);
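/* A worked example of the bias above: with VF == 4, no peeling for gaps or
   alignment and no partial vectors, min_epilogue_iters == 0 and
   bias_for_lowest == 1, so a scalar latch bound of 99 (100 iterations)
   becomes udiv_floor (99 + 1, 4) - 1 == 24, i.e. 25 vector iterations.  */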
9596 if (dump_enabled_p ())
9598 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9600 dump_printf_loc (MSG_NOTE, vect_location,
9601 "LOOP VECTORIZED\n");
9602 if (loop->inner)
9603 dump_printf_loc (MSG_NOTE, vect_location,
9604 "OUTER LOOP VECTORIZED\n");
9605 dump_printf (MSG_NOTE, "\n");
9607 else
9608 dump_printf_loc (MSG_NOTE, vect_location,
9609 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9610 GET_MODE_NAME (loop_vinfo->vector_mode));
9613 /* Loops vectorized with a variable factor won't benefit from
9614 unrolling/peeling. */
9615 if (!vf.is_constant ())
9617 loop->unroll = 1;
9618 if (dump_enabled_p ())
9619 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9620 " variable-length vectorization factor\n");
9622 /* Free SLP instances here because otherwise stmt reference counting
9623 won't work. */
9624 slp_instance instance;
9625 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9626 vect_free_slp_instance (instance);
9627 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9628 /* Clear the safelen field since its value is invalid after vectorization,
9629 as the vectorized loop can have loop-carried dependencies. */
9630 loop->safelen = 0;
9632 if (epilogue)
9634 update_epilogue_loop_vinfo (epilogue, advance);
9636 epilogue->simduid = loop->simduid;
9637 epilogue->force_vectorize = loop->force_vectorize;
9638 epilogue->dont_vectorize = false;
9641 return epilogue;
9644 /* The code below performs a simple optimization - it reverts
9645 if-conversion for masked stores: if the mask of a store is zero, the
9646 store is skipped, and so are the stored-value producers if possible.
9647 For example,
9648 for (i=0; i<n; i++)
9649 if (c[i])
9651 p1[i] += 1;
9652 p2[i] = p3[i] +2;
9654 this transformation will produce the following semi-hammock:
9656 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9658 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9659 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9660 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9661 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9662 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9663 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9667 void
9668 optimize_mask_stores (class loop *loop)
9670 basic_block *bbs = get_loop_body (loop);
9671 unsigned nbbs = loop->num_nodes;
9672 unsigned i;
9673 basic_block bb;
9674 class loop *bb_loop;
9675 gimple_stmt_iterator gsi;
9676 gimple *stmt;
9677 auto_vec<gimple *> worklist;
9678 auto_purge_vect_location sentinel;
9680 vect_location = find_loop_location (loop);
9681 /* Pick up all masked stores in loop if any. */
9682 for (i = 0; i < nbbs; i++)
9684 bb = bbs[i];
9685 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9686 gsi_next (&gsi))
9688 stmt = gsi_stmt (gsi);
9689 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9690 worklist.safe_push (stmt);
9694 free (bbs);
9695 if (worklist.is_empty ())
9696 return;
9698 /* Loop has masked stores. */
9699 while (!worklist.is_empty ())
9701 gimple *last, *last_store;
9702 edge e, efalse;
9703 tree mask;
9704 basic_block store_bb, join_bb;
9705 gimple_stmt_iterator gsi_to;
9706 tree vdef, new_vdef;
9707 gphi *phi;
9708 tree vectype;
9709 tree zero;
9711 last = worklist.pop ();
9712 mask = gimple_call_arg (last, 2);
9713 bb = gimple_bb (last);
9714 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
9715 to the same loop as if_bb. It could be different from LOOP when a
9716 two-level loop nest is vectorized and the mask store belongs to the
9717 inner loop. */
9718 e = split_block (bb, last);
9719 bb_loop = bb->loop_father;
9720 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9721 join_bb = e->dest;
9722 store_bb = create_empty_bb (bb);
9723 add_bb_to_loop (store_bb, bb_loop);
9724 e->flags = EDGE_TRUE_VALUE;
9725 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9726 /* Put STORE_BB on the unlikely path (taken when the mask is not all-zero). */
9727 efalse->probability = profile_probability::unlikely ();
9728 store_bb->count = efalse->count ();
9729 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9730 if (dom_info_available_p (CDI_DOMINATORS))
9731 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9732 if (dump_enabled_p ())
9733 dump_printf_loc (MSG_NOTE, vect_location,
9734 "Create new block %d to sink mask stores.",
9735 store_bb->index);
9736 /* Create vector comparison with boolean result. */
9737 vectype = TREE_TYPE (mask);
9738 zero = build_zero_cst (vectype);
9739 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9740 gsi = gsi_last_bb (bb);
9741 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9742 /* Create new PHI node for vdef of the last masked store:
9743 .MEM_2 = VDEF <.MEM_1>
9744 will be converted to
9745 .MEM_3 = VDEF <.MEM_1>
9746 and new PHI node will be created in join bb
9747 .MEM_2 = PHI <.MEM_1, .MEM_3>
9749 vdef = gimple_vdef (last);
9750 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9751 gimple_set_vdef (last, new_vdef);
9752 phi = create_phi_node (vdef, join_bb);
9753 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9755 /* Put all masked stores with the same mask to STORE_BB if possible. */
9756 while (true)
9758 gimple_stmt_iterator gsi_from;
9759 gimple *stmt1 = NULL;
9761 /* Move masked store to STORE_BB. */
9762 last_store = last;
9763 gsi = gsi_for_stmt (last);
9764 gsi_from = gsi;
9765 /* Shift GSI to the previous stmt for further traversal. */
9766 gsi_prev (&gsi);
9767 gsi_to = gsi_start_bb (store_bb);
9768 gsi_move_before (&gsi_from, &gsi_to);
9769 /* Setup GSI_TO to the non-empty block start. */
9770 gsi_to = gsi_start_bb (store_bb);
9771 if (dump_enabled_p ())
9772 dump_printf_loc (MSG_NOTE, vect_location,
9773 "Move stmt to created bb\n%G", last);
9774 /* Move all stored value producers if possible. */
9775 while (!gsi_end_p (gsi))
9777 tree lhs;
9778 imm_use_iterator imm_iter;
9779 use_operand_p use_p;
9780 bool res;
9782 /* Skip debug statements. */
9783 if (is_gimple_debug (gsi_stmt (gsi)))
9785 gsi_prev (&gsi);
9786 continue;
9788 stmt1 = gsi_stmt (gsi);
9789 /* Do not consider statements writing to memory or having a
9790 volatile operand. */
9791 if (gimple_vdef (stmt1)
9792 || gimple_has_volatile_ops (stmt1))
9793 break;
9794 gsi_from = gsi;
9795 gsi_prev (&gsi);
9796 lhs = gimple_get_lhs (stmt1);
9797 if (!lhs)
9798 break;
9800 /* LHS of vectorized stmt must be SSA_NAME. */
9801 if (TREE_CODE (lhs) != SSA_NAME)
9802 break;
9804 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9806 /* Remove dead scalar statement. */
9807 if (has_zero_uses (lhs))
9809 gsi_remove (&gsi_from, true);
9810 continue;
9814 /* Check that LHS does not have uses outside of STORE_BB. */
9815 res = true;
9816 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9818 gimple *use_stmt;
9819 use_stmt = USE_STMT (use_p);
9820 if (is_gimple_debug (use_stmt))
9821 continue;
9822 if (gimple_bb (use_stmt) != store_bb)
9824 res = false;
9825 break;
9828 if (!res)
9829 break;
9831 if (gimple_vuse (stmt1)
9832 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9833 break;
9835 /* Can move STMT1 to STORE_BB. */
9836 if (dump_enabled_p ())
9837 dump_printf_loc (MSG_NOTE, vect_location,
9838 "Move stmt to created bb\n%G", stmt1);
9839 gsi_move_before (&gsi_from, &gsi_to);
9840 /* Shift GSI_TO for further insertion. */
9841 gsi_prev (&gsi_to);
9843 /* Put other masked stores with the same mask to STORE_BB. */
9844 if (worklist.is_empty ()
9845 || gimple_call_arg (worklist.last (), 2) != mask
9846 || worklist.last () != stmt1)
9847 break;
9848 last = worklist.pop ();
9850 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9854 /* Decide whether it is possible to use a zero-based induction variable
9855 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9856 the value that the induction variable must be able to hold in order
9857 to ensure that the rgroups eventually have no active vector elements.
9858 Return -1 otherwise. */
9860 widest_int
9861 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9863 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9864 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9865 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9867 /* Calculate the value that the induction variable must be able
9868 to hit in order to ensure that we end the loop with an all-false mask.
9869 This involves adding the maximum number of inactive trailing scalar
9870 iterations. */
9871 widest_int iv_limit = -1;
9872 if (max_loop_iterations (loop, &iv_limit))
9874 if (niters_skip)
9876 /* Add the maximum number of skipped iterations to the
9877 maximum iteration count. */
9878 if (TREE_CODE (niters_skip) == INTEGER_CST)
9879 iv_limit += wi::to_widest (niters_skip);
9880 else
9881 iv_limit += max_vf - 1;
9883 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9884 /* Make a conservatively-correct assumption. */
9885 iv_limit += max_vf - 1;
9887 /* IV_LIMIT is the maximum number of latch iterations, which is also
9888 the maximum in-range IV value. Round this value down to the previous
9889 vector alignment boundary and then add an extra full iteration. */
9890 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9891 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
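/* For example (illustrative numbers only): with a maximum of 99 latch
   iterations, VF == 4, max_vf == 4 and no skipped or peeled iterations,
   the computation above gives (99 & -4) + 4 == 100 as the IV value that
   guarantees an all-false mask.  */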
9893 return iv_limit;
9896 /* For the given rgroup_controls RGC, check whether an induction variable
9897 would ever hit a value that produces a set of all-false masks or zero
9898 lengths before wrapping around. Return true if it's possible to wrap
9899 around before hitting the desirable value, otherwise return false. */
9901 bool
9902 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9904 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9906 if (iv_limit == -1)
9907 return true;
9909 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9910 unsigned int compare_precision = TYPE_PRECISION (compare_type);
9911 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
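/* For instance, with iv_limit == 100 and nitems == 2 the IV must be able to
   represent 200, which needs 8 bits; a 16-bit compare type therefore cannot
   wrap, whereas an 8-bit one could wrap as soon as the product needs 9 or
   more bits.  (Numbers chosen purely for illustration.)  */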
9913 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9914 return true;
9916 return false;