gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2020 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "tree-scalar-evolution.h"
  47 #include "tree-vectorizer.h"
  48 #include "gimple-fold.h"
  49 #include "cgraph.h"
  50 #include "tree-cfg.h"
  51 #include "tree-if-conv.h"
  52 #include "internal-fn.h"
  53 #include "tree-vector-builder.h"
  54 #include "vec-perm-indices.h"
  55 #include "tree-eh.h"
  56
  57 /* Loop Vectorization Pass.
  58
  59    This pass tries to vectorize loops.
  60
  61    For example, the vectorizer transforms the following simple loop:
  62
  63         short a[N]; short b[N]; short c[N]; int i;
  64
  65         for (i=0; i<N; i++){
  66           a[i] = b[i] + c[i];
  67         }
  68
  69    as if it was manually vectorized by rewriting the source code into:
  70
  71         typedef int __attribute__((mode(V8HI))) v8hi;
  72         short a[N];  short b[N]; short c[N];   int i;
  73         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  74         v8hi va, vb, vc;
  75
  76         for (i=0; i<N/8; i++){
  77           vb = pb[i];
  78           vc = pc[i];
  79           va = vb + vc;
  80           pa[i] = va;
  81         }
  82
  83         The main entry to this pass is vectorize_loops(), in which
  84    the vectorizer applies a set of analyses on a given set of loops,
  85    followed by the actual vectorization transformation for the loops that
  86    had successfully passed the analysis phase.
  87         Throughout this pass we make a distinction between two types of
  88    data: scalars (which are represented by SSA_NAMES), and memory references
  89    ("data-refs").  These two types of data require different handling both
  90    during analysis and transformation. The types of data-refs that the
  91    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  92    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  93    accesses are required to have a simple (consecutive) access pattern.
  94
  95    Analysis phase:
  96    ===============
  97         The driver for the analysis phase is vect_analyze_loop().
  98    It applies a set of analyses, some of which rely on the scalar evolution
  99    analyzer (scev) developed by Sebastian Pop.
 100
 101         During the analysis phase the vectorizer records some information
 102    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 103    loop, as well as general information about the loop as a whole, which is
 104    recorded in a "loop_vec_info" struct attached to each loop.
 105
 106    Transformation phase:
 107    =====================
 108         The loop transformation phase scans all the stmts in the loop, and
 109    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 110    the loop that needs to be vectorized.  It inserts the vector code sequence
 111    just before the scalar stmt S, and records a pointer to the vector code
 112    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 113    attached to S).  This pointer will be used for the vectorization of following
 114    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 115    otherwise, we rely on dead code elimination for removing it.
 116
 117         For example, say stmt S1 was vectorized into stmt VS1:
 118
 119    VS1: vb = px[i];
 120    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 121    S2:  a = b;
 122
 123    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 124    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 125    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 126    resulting sequence would be:
 127
 128    VS1: vb = px[i];
 129    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 130    VS2: va = vb;
 131    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 132
 133         Operands that are not SSA_NAMEs, are data-refs that appear in
 134    load/store operations (like 'x[i]' in S1), and are handled differently.
 135
 136    Target modeling:
 137    =================
 138         Currently the only target specific information that is used is the
 139    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 140    Targets that can support different sizes of vectors, for now will need
 141    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 142    flexibility will be added in the future.
 143
 144         Since we only vectorize operations which vector form can be
 145    expressed using existing tree codes, to verify that an operation is
 146    supported, the vectorizer checks the relevant optab at the relevant
 147    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 148    the value found is CODE_FOR_nothing, then there's no target support, and
 149    we can't vectorize the stmt.
 150
 151    For additional information on this project see:
 152    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 153 */
 154
 155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 157                                                bool *, bool *);
 158
 159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 160    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 161    may already be set for general statements (not just data refs).  */
 162
 163 static opt_result
 164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 165                               bool vectype_maybe_set_p,
 166                               poly_uint64 *vf)
 167 {
 168   gimple *stmt = stmt_info->stmt;
 169
 170   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 171        && !STMT_VINFO_LIVE_P (stmt_info))
 172       || gimple_clobber_p (stmt))
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 176       return opt_result::success ();
 177     }
 178
 179   tree stmt_vectype, nunits_vectype;
 180   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 181                                                    &stmt_vectype,
 182                                                    &nunits_vectype);
 183   if (!res)
 184     return res;
 185
 186   if (stmt_vectype)
 187     {
 188       if (STMT_VINFO_VECTYPE (stmt_info))
 189         /* The only case when a vectype had been already set is for stmts
 190            that contain a data ref, or for "pattern-stmts" (stmts generated
 191            by the vectorizer to represent/replace a certain idiom).  */
 192         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 193                      || vectype_maybe_set_p)
 194                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 195       else
 196         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 197     }
 198
 199   if (nunits_vectype)
 200     vect_update_max_nunits (vf, nunits_vectype);
 201
 202   return opt_result::success ();
 203 }
 204
 205 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 206    types of STMT_INFO and all attached pattern statements and update
 207    the vectorization factor VF accordingly.  Return true on success
 208    or false if something prevented vectorization.  */
 209
 210 static opt_result
 211 vect_determine_vf_for_stmt (vec_info *vinfo,
 212                             stmt_vec_info stmt_info, poly_uint64 *vf)
 213 {
 214   if (dump_enabled_p ())
 215     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 216                      stmt_info->stmt);
 217   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 218   if (!res)
 219     return res;
 220
 221   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 222       && STMT_VINFO_RELATED_STMT (stmt_info))
 223     {
 224       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 225       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 226
 227       /* If a pattern statement has def stmts, analyze them too.  */
 228       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 229            !gsi_end_p (si); gsi_next (&si))
 230         {
 231           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 232           if (dump_enabled_p ())
 233             dump_printf_loc (MSG_NOTE, vect_location,
 234                              "==> examining pattern def stmt: %G",
 235                              def_stmt_info->stmt);
 236           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 237           if (!res)
 238             return res;
 239         }
 240
 241       if (dump_enabled_p ())
 242         dump_printf_loc (MSG_NOTE, vect_location,
 243                          "==> examining pattern statement: %G",
 244                          stmt_info->stmt);
 245       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 246       if (!res)
 247         return res;
 248     }
 249
 250   return opt_result::success ();
 251 }
 252
 253 /* Function vect_determine_vectorization_factor
 254
 255    Determine the vectorization factor (VF).  VF is the number of data elements
 256    that are operated upon in parallel in a single iteration of the vectorized
 257    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 258    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 259    elements can fit in a single vector register.
 260
 261    We currently support vectorization of loops in which all types operated upon
 262    are of the same size.  Therefore this function currently sets VF according to
 263    the size of the types operated upon, and fails if there are multiple sizes
 264    in the loop.
 265
 266    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 267    original loop:
 268         for (i=0; i<N; i++){
 269           a[i] = b[i] + c[i];
 270         }
 271
 272    vectorized loop:
 273         for (i=0; i<N; i+=VF){
 274           a[i:VF] = b[i:VF] + c[i:VF];
 275         }
 276 */
 277
 278 static opt_result
 279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 280 {
 281   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 282   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 283   unsigned nbbs = loop->num_nodes;
 284   poly_uint64 vectorization_factor = 1;
 285   tree scalar_type = NULL_TREE;
 286   gphi *phi;
 287   tree vectype;
 288   stmt_vec_info stmt_info;
 289   unsigned i;
 290
 291   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 292
 293   for (i = 0; i < nbbs; i++)
 294     {
 295       basic_block bb = bbs[i];
 296
 297       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 298            gsi_next (&si))
 299         {
 300           phi = si.phi ();
 301           stmt_info = loop_vinfo->lookup_stmt (phi);
 302           if (dump_enabled_p ())
 303             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 304                              phi);
 305
 306           gcc_assert (stmt_info);
 307
 308           if (STMT_VINFO_RELEVANT_P (stmt_info)
 309               || STMT_VINFO_LIVE_P (stmt_info))
 310             {
 311               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 312               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 313
 314               if (dump_enabled_p ())
 315                 dump_printf_loc (MSG_NOTE, vect_location,
 316                                  "get vectype for scalar type:  %T\n",
 317                                  scalar_type);
 318
 319               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 320               if (!vectype)
 321                 return opt_result::failure_at (phi,
 322                                                "not vectorized: unsupported "
 323                                                "data-type %T\n",
 324                                                scalar_type);
 325               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 326
 327               if (dump_enabled_p ())
 328                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 329                                  vectype);
 330
 331               if (dump_enabled_p ())
 332                 {
 333                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 334                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 335                   dump_printf (MSG_NOTE, "\n");
 336                 }
 337
 338               vect_update_max_nunits (&vectorization_factor, vectype);
 339             }
 340         }
 341
 342       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 343            gsi_next (&si))
 344         {
 345           if (is_gimple_debug (gsi_stmt (si)))
 346             continue;
 347           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 348           opt_result res
 349             = vect_determine_vf_for_stmt (loop_vinfo,
 350                                           stmt_info, &vectorization_factor);
 351           if (!res)
 352             return res;
 353         }
 354     }
 355
 356   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 357   if (dump_enabled_p ())
 358     {
 359       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 360       dump_dec (MSG_NOTE, vectorization_factor);
 361       dump_printf (MSG_NOTE, "\n");
 362     }
 363
 364   if (known_le (vectorization_factor, 1U))
 365     return opt_result::failure_at (vect_location,
 366                                    "not vectorized: unsupported data-type\n");
 367   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 368   return opt_result::success ();
 369 }
 370
 371
 372 /* Function vect_is_simple_iv_evolution.
 373
 374    FORNOW: A simple evolution of an induction variables in the loop is
 375    considered a polynomial evolution.  */
 376
 377 static bool
 378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 379                              tree * step)
 380 {
 381   tree init_expr;
 382   tree step_expr;
 383   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 384   basic_block bb;
 385
 386   /* When there is no evolution in this loop, the evolution function
 387      is not "simple".  */
 388   if (evolution_part == NULL_TREE)
 389     return false;
 390
 391   /* When the evolution is a polynomial of degree >= 2
 392      the evolution function is not "simple".  */
 393   if (tree_is_chrec (evolution_part))
 394     return false;
 395
 396   step_expr = evolution_part;
 397   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 398
 399   if (dump_enabled_p ())
 400     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 401                      step_expr, init_expr);
 402
 403   *init = init_expr;
 404   *step = step_expr;
 405
 406   if (TREE_CODE (step_expr) != INTEGER_CST
 407       && (TREE_CODE (step_expr) != SSA_NAME
 408           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 409               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 410           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 411               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 412                   || !flag_associative_math)))
 413       && (TREE_CODE (step_expr) != REAL_CST
 414           || !flag_associative_math))
 415     {
 416       if (dump_enabled_p ())
 417         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 418                          "step unknown.\n");
 419       return false;
 420     }
 421
 422   return true;
 423 }
 424
 425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 426    what we are assuming is a double reduction.  For example, given
 427    a structure like this:
 428
 429       outer1:
 430         x_1 = PHI <x_4(outer2), ...>;
 431         ...
 432
 433       inner:
 434         x_2 = PHI <x_1(outer1), ...>;
 435         ...
 436         x_3 = ...;
 437         ...
 438
 439       outer2:
 440         x_4 = PHI <x_3(inner)>;
 441         ...
 442
 443    outer loop analysis would treat x_1 as a double reduction phi and
 444    this function would then return true for x_2.  */
 445
 446 static bool
 447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 448 {
 449   use_operand_p use_p;
 450   ssa_op_iter op_iter;
 451   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 452     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 453       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 454         return true;
 455   return false;
 456 }
 457
 458 /* Function vect_analyze_scalar_cycles_1.
 459
 460    Examine the cross iteration def-use cycles of scalar variables
 461    in LOOP.  LOOP_VINFO represents the loop that is now being
 462    considered for vectorization (can be LOOP, or an outer-loop
 463    enclosing LOOP).  */
 464
 465 static void
 466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
 467 {
 468   basic_block bb = loop->header;
 469   tree init, step;
 470   auto_vec<stmt_vec_info, 64> worklist;
 471   gphi_iterator gsi;
 472   bool double_reduc, reduc_chain;
 473
 474   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 475
 476   /* First - identify all inductions.  Reduction detection assumes that all the
 477      inductions have been identified, therefore, this order must not be
 478      changed.  */
 479   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 480     {
 481       gphi *phi = gsi.phi ();
 482       tree access_fn = NULL;
 483       tree def = PHI_RESULT (phi);
 484       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 485
 486       if (dump_enabled_p ())
 487         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 488
 489       /* Skip virtual phi's.  The data dependences that are associated with
 490          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 491       if (virtual_operand_p (def))
 492         continue;
 493
 494       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 495
 496       /* Analyze the evolution function.  */
 497       access_fn = analyze_scalar_evolution (loop, def);
 498       if (access_fn)
 499         {
 500           STRIP_NOPS (access_fn);
 501           if (dump_enabled_p ())
 502             dump_printf_loc (MSG_NOTE, vect_location,
 503                              "Access function of PHI: %T\n", access_fn);
 504           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 505             = initial_condition_in_loop_num (access_fn, loop->num);
 506           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 507             = evolution_part_in_loop_num (access_fn, loop->num);
 508         }
 509
 510       if (!access_fn
 511           || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 512           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 513           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 514               && TREE_CODE (step) != INTEGER_CST))
 515         {
 516           worklist.safe_push (stmt_vinfo);
 517           continue;
 518         }
 519
 520       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 521                   != NULL_TREE);
 522       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 523
 524       if (dump_enabled_p ())
 525         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 526       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 527     }
 528
 529
 530   /* Second - identify all reductions and nested cycles.  */
 531   while (worklist.length () > 0)
 532     {
 533       stmt_vec_info stmt_vinfo = worklist.pop ();
 534       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 535       tree def = PHI_RESULT (phi);
 536
 537       if (dump_enabled_p ())
 538         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 539
 540       gcc_assert (!virtual_operand_p (def)
 541                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 542
 543       stmt_vec_info reduc_stmt_info
 544         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 545                                     &reduc_chain);
 546       if (reduc_stmt_info)
 547         {
 548           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 549           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 550           if (double_reduc)
 551             {
 552               if (dump_enabled_p ())
 553                 dump_printf_loc (MSG_NOTE, vect_location,
 554                                  "Detected double reduction.\n");
 555
 556               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 557               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 558             }
 559           else
 560             {
 561               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 562                 {
 563                   if (dump_enabled_p ())
 564                     dump_printf_loc (MSG_NOTE, vect_location,
 565                                      "Detected vectorizable nested cycle.\n");
 566
 567                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 568                 }
 569               else
 570                 {
 571                   if (dump_enabled_p ())
 572                     dump_printf_loc (MSG_NOTE, vect_location,
 573                                      "Detected reduction.\n");
 574
 575                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 576                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 577                   /* Store the reduction cycles for possible vectorization in
 578                      loop-aware SLP if it was not detected as reduction
 579                      chain.  */
 580                   if (! reduc_chain)
 581                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 582                       (reduc_stmt_info);
 583                 }
 584             }
 585         }
 586       else
 587         if (dump_enabled_p ())
 588           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 589                            "Unknown def-use cycle pattern.\n");
 590     }
 591 }
 592
 593
 594 /* Function vect_analyze_scalar_cycles.
 595
 596    Examine the cross iteration def-use cycles of scalar variables, by
 597    analyzing the loop-header PHIs of scalar variables.  Classify each
 598    cycle as one of the following: invariant, induction, reduction, unknown.
 599    We do that for the loop represented by LOOP_VINFO, and also to its
 600    inner-loop, if exists.
 601    Examples for scalar cycles:
 602
 603    Example1: reduction:
 604
 605               loop1:
 606               for (i=0; i<N; i++)
 607                  sum += a[i];
 608
 609    Example2: induction:
 610
 611               loop2:
 612               for (i=0; i<N; i++)
 613                  a[i] = i;  */
 614
 615 static void
 616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 617 {
 618   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 619
 620   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 621
 622   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 623      Reductions in such inner-loop therefore have different properties than
 624      the reductions in the nest that gets vectorized:
 625      1. When vectorized, they are executed in the same order as in the original
 626         scalar loop, so we can't change the order of computation when
 627         vectorizing them.
 628      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 629         current checks are too strict.  */
 630
 631   if (loop->inner)
 632     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 633 }
 634
 635 /* Transfer group and reduction information from STMT_INFO to its
 636    pattern stmt.  */
 637
 638 static void
 639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 640 {
 641   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 642   stmt_vec_info stmtp;
 643   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 644               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 645   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 646   do
 647     {
 648       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 649       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 650                            == STMT_VINFO_DEF_TYPE (stmt_info));
 651       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 652       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 653       if (stmt_info)
 654         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 655           = STMT_VINFO_RELATED_STMT (stmt_info);
 656     }
 657   while (stmt_info);
 658 }
 659
 660 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 661
 662 static void
 663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 664 {
 665   stmt_vec_info first;
 666   unsigned i;
 667
 668   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 669     {
 670       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 671       while (next)
 672         {
 673           if ((STMT_VINFO_IN_PATTERN_P (next)
 674                != STMT_VINFO_IN_PATTERN_P (first))
 675               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 676             break;
 677           next = REDUC_GROUP_NEXT_ELEMENT (next);
 678         }
 679       /* If all reduction chain members are well-formed patterns adjust
 680          the group to group the pattern stmts instead.  */
 681       if (! next
 682           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 683         {
 684           if (STMT_VINFO_IN_PATTERN_P (first))
 685             {
 686               vect_fixup_reduc_chain (first);
 687               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 688                 = STMT_VINFO_RELATED_STMT (first);
 689             }
 690         }
 691       /* If not all stmt in the chain are patterns or if we failed
 692          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 693          it as regular reduction instead.  */
 694       else
 695         {
 696           stmt_vec_info vinfo = first;
 697           stmt_vec_info last = NULL;
 698           while (vinfo)
 699             {
 700               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 701               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 702               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 703               last = vinfo;
 704               vinfo = next;
 705             }
 706           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 707             = vect_internal_def;
 708           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 709           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 710           --i;
 711         }
 712     }
 713 }
 714
 715 /* Function vect_get_loop_niters.
 716
 717    Determine how many iterations the loop is executed and place it
 718    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 719    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 720    niter information holds in ASSUMPTIONS.
 721
 722    Return the loop exit condition.  */
 723
 724
 725 static gcond *
 726 vect_get_loop_niters (class loop *loop, tree *assumptions,
 727                       tree *number_of_iterations, tree *number_of_iterationsm1)
 728 {
 729   edge exit = single_exit (loop);
 730   class tree_niter_desc niter_desc;
 731   tree niter_assumptions, niter, may_be_zero;
 732   gcond *cond = get_loop_exit_condition (loop);
 733
 734   *assumptions = boolean_true_node;
 735   *number_of_iterationsm1 = chrec_dont_know;
 736   *number_of_iterations = chrec_dont_know;
 737   DUMP_VECT_SCOPE ("get_loop_niters");
 738
 739   if (!exit)
 740     return cond;
 741
 742   may_be_zero = NULL_TREE;
 743   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 744       || chrec_contains_undetermined (niter_desc.niter))
 745     return cond;
 746
 747   niter_assumptions = niter_desc.assumptions;
 748   may_be_zero = niter_desc.may_be_zero;
 749   niter = niter_desc.niter;
 750
 751   if (may_be_zero && integer_zerop (may_be_zero))
 752     may_be_zero = NULL_TREE;
 753
 754   if (may_be_zero)
 755     {
 756       if (COMPARISON_CLASS_P (may_be_zero))
 757         {
 758           /* Try to combine may_be_zero with assumptions, this can simplify
 759              computation of niter expression.  */
 760           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 761             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 762                                              niter_assumptions,
 763                                              fold_build1 (TRUTH_NOT_EXPR,
 764                                                           boolean_type_node,
 765                                                           may_be_zero));
 766           else
 767             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 768                                  build_int_cst (TREE_TYPE (niter), 0),
 769                                  rewrite_to_non_trapping_overflow (niter));
 770
 771           may_be_zero = NULL_TREE;
 772         }
 773       else if (integer_nonzerop (may_be_zero))
 774         {
 775           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 776           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 777           return cond;
 778         }
 779       else
 780         return cond;
 781     }
 782
 783   *assumptions = niter_assumptions;
 784   *number_of_iterationsm1 = niter;
 785
 786   /* We want the number of loop header executions which is the number
 787      of latch executions plus one.
 788      ???  For UINT_MAX latch executions this number overflows to zero
 789      for loops like do { n++; } while (n != 0);  */
 790   if (niter && !chrec_contains_undetermined (niter))
 791     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 792                           build_int_cst (TREE_TYPE (niter), 1));
 793   *number_of_iterations = niter;
 794
 795   return cond;
 796 }
 797
 798 /* Function bb_in_loop_p
 799
 800    Used as predicate for dfs order traversal of the loop bbs.  */
 801
 802 static bool
 803 bb_in_loop_p (const_basic_block bb, const void *data)
 804 {
 805   const class loop *const loop = (const class loop *)data;
 806   if (flow_bb_inside_loop_p (loop, bb))
 807     return true;
 808   return false;
 809 }
 810
 811
 812 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 813    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 814
 815 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
 816   : vec_info (vec_info::loop, init_cost (loop_in), shared),
 817     loop (loop_in),
 818     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 819     num_itersm1 (NULL_TREE),
 820     num_iters (NULL_TREE),
 821     num_iters_unchanged (NULL_TREE),
 822     num_iters_assumptions (NULL_TREE),
 823     th (0),
 824     versioning_threshold (0),
 825     vectorization_factor (0),
 826     max_vectorization_factor (0),
 827     mask_skip_niters (NULL_TREE),
 828     rgroup_compare_type (NULL_TREE),
 829     simd_if_cond (NULL_TREE),
 830     unaligned_dr (NULL),
 831     peeling_for_alignment (0),
 832     ptr_mask (0),
 833     ivexpr_map (NULL),
 834     scan_map (NULL),
 835     slp_unrolling_factor (1),
 836     single_scalar_iteration_cost (0),
 837     vec_outside_cost (0),
 838     vec_inside_cost (0),
 839     vectorizable (false),
 840     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
 841     using_partial_vectors_p (false),
 842     epil_using_partial_vectors_p (false),
 843     peeling_for_gaps (false),
 844     peeling_for_niter (false),
 845     no_data_dependencies (false),
 846     has_mask_store (false),
 847     scalar_loop_scaling (profile_probability::uninitialized ()),
 848     scalar_loop (NULL),
 849     orig_loop_info (NULL)
 850 {
 851   /* CHECKME: We want to visit all BBs before their successors (except for
 852      latch blocks, for which this assertion wouldn't hold).  In the simple
 853      case of the loop forms we allow, a dfs order of the BBs would the same
 854      as reversed postorder traversal, so we are safe.  */
 855
 856   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 857                                           bbs, loop->num_nodes, loop);
 858   gcc_assert (nbbs == loop->num_nodes);
 859
 860   for (unsigned int i = 0; i < nbbs; i++)
 861     {
 862       basic_block bb = bbs[i];
 863       gimple_stmt_iterator si;
 864
 865       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 866         {
 867           gimple *phi = gsi_stmt (si);
 868           gimple_set_uid (phi, 0);
 869           add_stmt (phi);
 870         }
 871
 872       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 873         {
 874           gimple *stmt = gsi_stmt (si);
 875           gimple_set_uid (stmt, 0);
 876           if (is_gimple_debug (stmt))
 877             continue;
 878           add_stmt (stmt);
 879           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
 880              third argument is the #pragma omp simd if (x) condition, when 0,
 881              loop shouldn't be vectorized, when non-zero constant, it should
 882              be vectorized normally, otherwise versioned with vectorized loop
 883              done if the condition is non-zero at runtime.  */
 884           if (loop_in->simduid
 885               && is_gimple_call (stmt)
 886               && gimple_call_internal_p (stmt)
 887               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
 888               && gimple_call_num_args (stmt) >= 3
 889               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
 890               && (loop_in->simduid
 891                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
 892             {
 893               tree arg = gimple_call_arg (stmt, 2);
 894               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
 895                 simd_if_cond = arg;
 896               else
 897                 gcc_assert (integer_nonzerop (arg));
 898             }
 899         }
 900     }
 901
 902   epilogue_vinfos.create (6);
 903 }
 904
 905 /* Free all levels of rgroup CONTROLS.  */
 906
 907 void
 908 release_vec_loop_controls (vec<rgroup_controls> *controls)
 909 {
 910   rgroup_controls *rgc;
 911   unsigned int i;
 912   FOR_EACH_VEC_ELT (*controls, i, rgc)
 913     rgc->controls.release ();
 914   controls->release ();
 915 }
 916
 917 /* Free all memory used by the _loop_vec_info, as well as all the
 918    stmt_vec_info structs of all the stmts in the loop.  */
 919
 920 _loop_vec_info::~_loop_vec_info ()
 921 {
 922   free (bbs);
 923
 924   release_vec_loop_controls (&masks);
 925   release_vec_loop_controls (&lens);
 926   delete ivexpr_map;
 927   delete scan_map;
 928   epilogue_vinfos.release ();
 929
 930   loop->aux = NULL;
 931 }
 932
 933 /* Return an invariant or register for EXPR and emit necessary
 934    computations in the LOOP_VINFO loop preheader.  */
 935
 936 tree
 937 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 938 {
 939   if (is_gimple_reg (expr)
 940       || is_gimple_min_invariant (expr))
 941     return expr;
 942
 943   if (! loop_vinfo->ivexpr_map)
 944     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 945   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 946   if (! cached)
 947     {
 948       gimple_seq stmts = NULL;
 949       cached = force_gimple_operand (unshare_expr (expr),
 950                                      &stmts, true, NULL_TREE);
 951       if (stmts)
 952         {
 953           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 954           gsi_insert_seq_on_edge_immediate (e, stmts);
 955         }
 956     }
 957   return cached;
 958 }
 959
 960 /* Return true if we can use CMP_TYPE as the comparison type to produce
 961    all masks required to mask LOOP_VINFO.  */
 962
 963 static bool
 964 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 965 {
 966   rgroup_controls *rgm;
 967   unsigned int i;
 968   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 969     if (rgm->type != NULL_TREE
 970         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 971                                             cmp_type, rgm->type,
 972                                             OPTIMIZE_FOR_SPEED))
 973       return false;
 974   return true;
 975 }
 976
 977 /* Calculate the maximum number of scalars per iteration for every
 978    rgroup in LOOP_VINFO.  */
 979
 980 static unsigned int
 981 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
 982 {
 983   unsigned int res = 1;
 984   unsigned int i;
 985   rgroup_controls *rgm;
 986   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 987     res = MAX (res, rgm->max_nscalars_per_iter);
 988   return res;
 989 }
 990
 991 /* Calculate the minimum precision necessary to represent:
 992
 993       MAX_NITERS * FACTOR
 994
 995    as an unsigned integer, where MAX_NITERS is the maximum number of
 996    loop header iterations for the original scalar form of LOOP_VINFO.  */
 997
 998 static unsigned
 999 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1000 {
1001   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1002
1003   /* Get the maximum number of iterations that is representable
1004      in the counter type.  */
1005   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1006   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1007
1008   /* Get a more refined estimate for the number of iterations.  */
1009   widest_int max_back_edges;
1010   if (max_loop_iterations (loop, &max_back_edges))
1011     max_ni = wi::smin (max_ni, max_back_edges + 1);
1012
1013   /* Work out how many bits we need to represent the limit.  */
1014   return wi::min_precision (max_ni * factor, UNSIGNED);
1015 }
1016
1017 /* True if the loop needs peeling or partial vectors when vectorized.  */
1018
1019 static bool
1020 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1021 {
1022   unsigned HOST_WIDE_INT const_vf;
1023   HOST_WIDE_INT max_niter
1024     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1025
1026   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1027   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1028     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1029                                           (loop_vinfo));
1030
1031   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1032       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1033     {
1034       /* Work out the (constant) number of iterations that need to be
1035          peeled for reasons other than niters.  */
1036       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1037       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1038         peel_niter += 1;
1039       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1040                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1041         return true;
1042     }
1043   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1044       /* ??? When peeling for gaps but not alignment, we could
1045          try to check whether the (variable) niters is known to be
1046          VF * N + 1.  That's something of a niche case though.  */
1047       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1048       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1049       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1050            < (unsigned) exact_log2 (const_vf))
1051           /* In case of versioning, check if the maximum number of
1052              iterations is greater than th.  If they are identical,
1053              the epilogue is unnecessary.  */
1054           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1055               || ((unsigned HOST_WIDE_INT) max_niter
1056                   > (th / const_vf) * const_vf))))
1057     return true;
1058
1059   return false;
1060 }
1061
1062 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1063    whether we can actually generate the masks required.  Return true if so,
1064    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1065
1066 static bool
1067 vect_verify_full_masking (loop_vec_info loop_vinfo)
1068 {
1069   unsigned int min_ni_width;
1070   unsigned int max_nscalars_per_iter
1071     = vect_get_max_nscalars_per_iter (loop_vinfo);
1072
1073   /* Use a normal loop if there are no statements that need masking.
1074      This only happens in rare degenerate cases: it means that the loop
1075      has no loads, no stores, and no live-out values.  */
1076   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1077     return false;
1078
1079   /* Work out how many bits we need to represent the limit.  */
1080   min_ni_width
1081     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1082
1083   /* Find a scalar mode for which WHILE_ULT is supported.  */
1084   opt_scalar_int_mode cmp_mode_iter;
1085   tree cmp_type = NULL_TREE;
1086   tree iv_type = NULL_TREE;
1087   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1088   unsigned int iv_precision = UINT_MAX;
1089
1090   if (iv_limit != -1)
1091     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1092                                       UNSIGNED);
1093
1094   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1095     {
1096       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1097       if (cmp_bits >= min_ni_width
1098           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1099         {
1100           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1101           if (this_type
1102               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1103             {
1104               /* Although we could stop as soon as we find a valid mode,
1105                  there are at least two reasons why that's not always the
1106                  best choice:
1107
1108                  - An IV that's Pmode or wider is more likely to be reusable
1109                    in address calculations than an IV that's narrower than
1110                    Pmode.
1111
1112                  - Doing the comparison in IV_PRECISION or wider allows
1113                    a natural 0-based IV, whereas using a narrower comparison
1114                    type requires mitigations against wrap-around.
1115
1116                  Conversely, if the IV limit is variable, doing the comparison
1117                  in a wider type than the original type can introduce
1118                  unnecessary extensions, so picking the widest valid mode
1119                  is not always a good choice either.
1120
1121                  Here we prefer the first IV type that's Pmode or wider,
1122                  and the first comparison type that's IV_PRECISION or wider.
1123                  (The comparison type must be no wider than the IV type,
1124                  to avoid extensions in the vector loop.)
1125
1126                  ??? We might want to try continuing beyond Pmode for ILP32
1127                  targets if CMP_BITS < IV_PRECISION.  */
1128               iv_type = this_type;
1129               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1130                 cmp_type = this_type;
1131               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1132                 break;
1133             }
1134         }
1135     }
1136
1137   if (!cmp_type)
1138     return false;
1139
1140   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1141   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1142   return true;
1143 }
1144
1145 /* Check whether we can use vector access with length based on precison
1146    comparison.  So far, to keep it simple, we only allow the case that the
1147    precision of the target supported length is larger than the precision
1148    required by loop niters.  */
1149
1150 static bool
1151 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1152 {
1153   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1154     return false;
1155
1156   unsigned int max_nitems_per_iter = 1;
1157   unsigned int i;
1158   rgroup_controls *rgl;
1159   /* Find the maximum number of items per iteration for every rgroup.  */
1160   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1161     {
1162       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1163       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1164     }
1165
1166   /* Work out how many bits we need to represent the length limit.  */
1167   unsigned int min_ni_prec
1168     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1169
1170   /* Now use the maximum of below precisions for one suitable IV type:
1171      - the IV's natural precision
1172      - the precision needed to hold: the maximum number of scalar
1173        iterations multiplied by the scale factor (min_ni_prec above)
1174      - the Pmode precision
1175
1176      If min_ni_prec is less than the precision of the current niters,
1177      we perfer to still use the niters type.  Prefer to use Pmode and
1178      wider IV to avoid narrow conversions.  */
1179
1180   unsigned int ni_prec
1181     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1182   min_ni_prec = MAX (min_ni_prec, ni_prec);
1183   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1184
1185   tree iv_type = NULL_TREE;
1186   opt_scalar_int_mode tmode_iter;
1187   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1188     {
1189       scalar_mode tmode = tmode_iter.require ();
1190       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1191
1192       /* ??? Do we really want to construct one IV whose precision exceeds
1193          BITS_PER_WORD?  */
1194       if (tbits > BITS_PER_WORD)
1195         break;
1196
1197       /* Find the first available standard integral type.  */
1198       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1199         {
1200           iv_type = build_nonstandard_integer_type (tbits, true);
1201           break;
1202         }
1203     }
1204
1205   if (!iv_type)
1206     {
1207       if (dump_enabled_p ())
1208         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1209                          "can't vectorize with length-based partial vectors"
1210                          " because there is no suitable iv type.\n");
1211       return false;
1212     }
1213
1214   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1215   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1216
1217   return true;
1218 }
1219
1220 /* Calculate the cost of one scalar iteration of the loop.  */
1221 static void
1222 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1223 {
1224   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1225   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1226   int nbbs = loop->num_nodes, factor;
1227   int innerloop_iters, i;
1228
1229   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1230
1231   /* Gather costs for statements in the scalar loop.  */
1232
1233   /* FORNOW.  */
1234   innerloop_iters = 1;
1235   if (loop->inner)
1236     innerloop_iters = 50; /* FIXME */
1237
1238   for (i = 0; i < nbbs; i++)
1239     {
1240       gimple_stmt_iterator si;
1241       basic_block bb = bbs[i];
1242
1243       if (bb->loop_father == loop->inner)
1244         factor = innerloop_iters;
1245       else
1246         factor = 1;
1247
1248       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1249         {
1250           gimple *stmt = gsi_stmt (si);
1251           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1252
1253           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1254             continue;
1255
1256           /* Skip stmts that are not vectorized inside the loop.  */
1257           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1258           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1259               && (!STMT_VINFO_LIVE_P (vstmt_info)
1260                   || !VECTORIZABLE_CYCLE_DEF
1261                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1262             continue;
1263
1264           vect_cost_for_stmt kind;
1265           if (STMT_VINFO_DATA_REF (stmt_info))
1266             {
1267               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1268                kind = scalar_load;
1269              else
1270                kind = scalar_store;
1271             }
1272           else if (vect_nop_conversion_p (stmt_info))
1273             continue;
1274           else
1275             kind = scalar_stmt;
1276
1277           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1278                             factor, kind, stmt_info, 0, vect_prologue);
1279         }
1280     }
1281
1282   /* Now accumulate cost.  */
1283   void *target_cost_data = init_cost (loop);
1284   stmt_info_for_cost *si;
1285   int j;
1286   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1287                     j, si)
1288     (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1289                           si->kind, si->stmt_info, si->vectype,
1290                           si->misalign, vect_body);
1291   unsigned dummy, body_cost = 0;
1292   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1293   destroy_cost_data (target_cost_data);
1294   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1295 }
1296
1297
1298 /* Function vect_analyze_loop_form_1.
1299
1300    Verify that certain CFG restrictions hold, including:
1301    - the loop has a pre-header
1302    - the loop has a single entry and exit
1303    - the loop exit condition is simple enough
1304    - the number of iterations can be analyzed, i.e, a countable loop.  The
1305      niter could be analyzed under some assumptions.  */
1306
1307 opt_result
1308 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1309                           tree *assumptions, tree *number_of_iterationsm1,
1310                           tree *number_of_iterations, gcond **inner_loop_cond)
1311 {
1312   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1313
1314   /* Different restrictions apply when we are considering an inner-most loop,
1315      vs. an outer (nested) loop.
1316      (FORNOW. May want to relax some of these restrictions in the future).  */
1317
1318   if (!loop->inner)
1319     {
1320       /* Inner-most loop.  We currently require that the number of BBs is
1321          exactly 2 (the header and latch).  Vectorizable inner-most loops
1322          look like this:
1323
1324                         (pre-header)
1325                            |
1326                           header <--------+
1327                            | |            |
1328                            | +--> latch --+
1329                            |
1330                         (exit-bb)  */
1331
1332       if (loop->num_nodes != 2)
1333         return opt_result::failure_at (vect_location,
1334                                        "not vectorized:"
1335                                        " control flow in loop.\n");
1336
1337       if (empty_block_p (loop->header))
1338         return opt_result::failure_at (vect_location,
1339                                        "not vectorized: empty loop.\n");
1340     }
1341   else
1342     {
1343       class loop *innerloop = loop->inner;
1344       edge entryedge;
1345
1346       /* Nested loop. We currently require that the loop is doubly-nested,
1347          contains a single inner loop, and the number of BBs is exactly 5.
1348          Vectorizable outer-loops look like this:
1349
1350                         (pre-header)
1351                            |
1352                           header <---+
1353                            |         |
1354                           inner-loop |
1355                            |         |
1356                           tail ------+
1357                            |
1358                         (exit-bb)
1359
1360          The inner-loop has the properties expected of inner-most loops
1361          as described above.  */
1362
1363       if ((loop->inner)->inner || (loop->inner)->next)
1364         return opt_result::failure_at (vect_location,
1365                                        "not vectorized:"
1366                                        " multiple nested loops.\n");
1367
1368       if (loop->num_nodes != 5)
1369         return opt_result::failure_at (vect_location,
1370                                        "not vectorized:"
1371                                        " control flow in loop.\n");
1372
1373       entryedge = loop_preheader_edge (innerloop);
1374       if (entryedge->src != loop->header
1375           || !single_exit (innerloop)
1376           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1377         return opt_result::failure_at (vect_location,
1378                                        "not vectorized:"
1379                                        " unsupported outerloop form.\n");
1380
1381       /* Analyze the inner-loop.  */
1382       tree inner_niterm1, inner_niter, inner_assumptions;
1383       opt_result res
1384         = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1385                                     &inner_assumptions, &inner_niterm1,
1386                                     &inner_niter, NULL);
1387       if (!res)
1388         {
1389           if (dump_enabled_p ())
1390             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1391                              "not vectorized: Bad inner loop.\n");
1392           return res;
1393         }
1394
1395       /* Don't support analyzing niter under assumptions for inner
1396          loop.  */
1397       if (!integer_onep (inner_assumptions))
1398         return opt_result::failure_at (vect_location,
1399                                        "not vectorized: Bad inner loop.\n");
1400
1401       if (!expr_invariant_in_loop_p (loop, inner_niter))
1402         return opt_result::failure_at (vect_location,
1403                                        "not vectorized: inner-loop count not"
1404                                        " invariant.\n");
1405
1406       if (dump_enabled_p ())
1407         dump_printf_loc (MSG_NOTE, vect_location,
1408                          "Considering outer-loop vectorization.\n");
1409     }
1410
1411   if (!single_exit (loop))
1412     return opt_result::failure_at (vect_location,
1413                                    "not vectorized: multiple exits.\n");
1414   if (EDGE_COUNT (loop->header->preds) != 2)
1415     return opt_result::failure_at (vect_location,
1416                                    "not vectorized:"
1417                                    " too many incoming edges.\n");
1418
1419   /* We assume that the loop exit condition is at the end of the loop. i.e,
1420      that the loop is represented as a do-while (with a proper if-guard
1421      before the loop if needed), where the loop header contains all the
1422      executable statements, and the latch is empty.  */
1423   if (!empty_block_p (loop->latch)
1424       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1425     return opt_result::failure_at (vect_location,
1426                                    "not vectorized: latch block not empty.\n");
1427
1428   /* Make sure the exit is not abnormal.  */
1429   edge e = single_exit (loop);
1430   if (e->flags & EDGE_ABNORMAL)
1431     return opt_result::failure_at (vect_location,
1432                                    "not vectorized:"
1433                                    " abnormal loop exit edge.\n");
1434
1435   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1436                                      number_of_iterationsm1);
1437   if (!*loop_cond)
1438     return opt_result::failure_at
1439       (vect_location,
1440        "not vectorized: complicated exit condition.\n");
1441
1442   if (integer_zerop (*assumptions)
1443       || !*number_of_iterations
1444       || chrec_contains_undetermined (*number_of_iterations))
1445     return opt_result::failure_at
1446       (*loop_cond,
1447        "not vectorized: number of iterations cannot be computed.\n");
1448
1449   if (integer_zerop (*number_of_iterations))
1450     return opt_result::failure_at
1451       (*loop_cond,
1452        "not vectorized: number of iterations = 0.\n");
1453
1454   return opt_result::success ();
1455 }
1456
1457 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1458
1459 opt_loop_vec_info
1460 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1461 {
1462   tree assumptions, number_of_iterations, number_of_iterationsm1;
1463   gcond *loop_cond, *inner_loop_cond = NULL;
1464
1465   opt_result res
1466     = vect_analyze_loop_form_1 (loop, &loop_cond,
1467                                 &assumptions, &number_of_iterationsm1,
1468                                 &number_of_iterations, &inner_loop_cond);
1469   if (!res)
1470     return opt_loop_vec_info::propagate_failure (res);
1471
1472   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1473   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1474   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1475   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1476   if (!integer_onep (assumptions))
1477     {
1478       /* We consider to vectorize this loop by versioning it under
1479          some assumptions.  In order to do this, we need to clear
1480          existing information computed by scev and niter analyzer.  */
1481       scev_reset_htab ();
1482       free_numbers_of_iterations_estimates (loop);
1483       /* Also set flag for this loop so that following scev and niter
1484          analysis are done under the assumptions.  */
1485       loop_constraint_set (loop, LOOP_C_FINITE);
1486       /* Also record the assumptions for versioning.  */
1487       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1488     }
1489
1490   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1491     {
1492       if (dump_enabled_p ())
1493         {
1494           dump_printf_loc (MSG_NOTE, vect_location,
1495                            "Symbolic number of iterations is ");
1496           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1497           dump_printf (MSG_NOTE, "\n");
1498         }
1499     }
1500
1501   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1502   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1503   if (inner_loop_cond)
1504     {
1505       stmt_vec_info inner_loop_cond_info
1506         = loop_vinfo->lookup_stmt (inner_loop_cond);
1507       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1508     }
1509
1510   gcc_assert (!loop->aux);
1511   loop->aux = loop_vinfo;
1512   return opt_loop_vec_info::success (loop_vinfo);
1513 }
1514
1515
1516
1517 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1518    statements update the vectorization factor.  */
1519
1520 static void
1521 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1522 {
1523   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1524   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1525   int nbbs = loop->num_nodes;
1526   poly_uint64 vectorization_factor;
1527   int i;
1528
1529   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1530
1531   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1532   gcc_assert (known_ne (vectorization_factor, 0U));
1533
1534   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1535      vectorization factor of the loop is the unrolling factor required by
1536      the SLP instances.  If that unrolling factor is 1, we say, that we
1537      perform pure SLP on loop - cross iteration parallelism is not
1538      exploited.  */
1539   bool only_slp_in_loop = true;
1540   for (i = 0; i < nbbs; i++)
1541     {
1542       basic_block bb = bbs[i];
1543       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1544            gsi_next (&si))
1545         {
1546           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1547           if (!stmt_info)
1548             continue;
1549           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1550                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1551               && !PURE_SLP_STMT (stmt_info))
1552             /* STMT needs both SLP and loop-based vectorization.  */
1553             only_slp_in_loop = false;
1554         }
1555       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1556            gsi_next (&si))
1557         {
1558           if (is_gimple_debug (gsi_stmt (si)))
1559             continue;
1560           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1561           stmt_info = vect_stmt_to_vectorize (stmt_info);
1562           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1563                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1564               && !PURE_SLP_STMT (stmt_info))
1565             /* STMT needs both SLP and loop-based vectorization.  */
1566             only_slp_in_loop = false;
1567         }
1568     }
1569
1570   if (only_slp_in_loop)
1571     {
1572       if (dump_enabled_p ())
1573         dump_printf_loc (MSG_NOTE, vect_location,
1574                          "Loop contains only SLP stmts\n");
1575       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1576     }
1577   else
1578     {
1579       if (dump_enabled_p ())
1580         dump_printf_loc (MSG_NOTE, vect_location,
1581                          "Loop contains SLP and non-SLP stmts\n");
1582       /* Both the vectorization factor and unroll factor have the form
1583          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1584          so they must have a common multiple.  */
1585       vectorization_factor
1586         = force_common_multiple (vectorization_factor,
1587                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1588     }
1589
1590   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1591   if (dump_enabled_p ())
1592     {
1593       dump_printf_loc (MSG_NOTE, vect_location,
1594                        "Updating vectorization factor to ");
1595       dump_dec (MSG_NOTE, vectorization_factor);
1596       dump_printf (MSG_NOTE, ".\n");
1597     }
1598 }
1599
1600 /* Return true if STMT_INFO describes a double reduction phi and if
1601    the other phi in the reduction is also relevant for vectorization.
1602    This rejects cases such as:
1603
1604       outer1:
1605         x_1 = PHI <x_3(outer2), ...>;
1606         ...
1607
1608       inner:
1609         x_2 = ...;
1610         ...
1611
1612       outer2:
1613         x_3 = PHI <x_2(inner)>;
1614
1615    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1616
1617 static bool
1618 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1619 {
1620   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1621     return false;
1622
1623   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1624 }
1625
1626 /* Function vect_analyze_loop_operations.
1627
1628    Scan the loop stmts and make sure they are all vectorizable.  */
1629
1630 static opt_result
1631 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1632 {
1633   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1634   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1635   int nbbs = loop->num_nodes;
1636   int i;
1637   stmt_vec_info stmt_info;
1638   bool need_to_vectorize = false;
1639   bool ok;
1640
1641   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1642
1643   auto_vec<stmt_info_for_cost> cost_vec;
1644
1645   for (i = 0; i < nbbs; i++)
1646     {
1647       basic_block bb = bbs[i];
1648
1649       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1650            gsi_next (&si))
1651         {
1652           gphi *phi = si.phi ();
1653           ok = true;
1654
1655           stmt_info = loop_vinfo->lookup_stmt (phi);
1656           if (dump_enabled_p ())
1657             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1658           if (virtual_operand_p (gimple_phi_result (phi)))
1659             continue;
1660
1661           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1662              (i.e., a phi in the tail of the outer-loop).  */
1663           if (! is_loop_header_bb_p (bb))
1664             {
1665               /* FORNOW: we currently don't support the case that these phis
1666                  are not used in the outerloop (unless it is double reduction,
1667                  i.e., this phi is vect_reduction_def), cause this case
1668                  requires to actually do something here.  */
1669               if (STMT_VINFO_LIVE_P (stmt_info)
1670                   && !vect_active_double_reduction_p (stmt_info))
1671                 return opt_result::failure_at (phi,
1672                                                "Unsupported loop-closed phi"
1673                                                " in outer-loop.\n");
1674
1675               /* If PHI is used in the outer loop, we check that its operand
1676                  is defined in the inner loop.  */
1677               if (STMT_VINFO_RELEVANT_P (stmt_info))
1678                 {
1679                   tree phi_op;
1680
1681                   if (gimple_phi_num_args (phi) != 1)
1682                     return opt_result::failure_at (phi, "unsupported phi");
1683
1684                   phi_op = PHI_ARG_DEF (phi, 0);
1685                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1686                   if (!op_def_info)
1687                     return opt_result::failure_at (phi, "unsupported phi\n");
1688
1689                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1690                       && (STMT_VINFO_RELEVANT (op_def_info)
1691                           != vect_used_in_outer_by_reduction))
1692                     return opt_result::failure_at (phi, "unsupported phi\n");
1693
1694                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1695                        || (STMT_VINFO_DEF_TYPE (stmt_info)
1696                            == vect_double_reduction_def))
1697                       && !vectorizable_lc_phi (loop_vinfo,
1698                                                stmt_info, NULL, NULL))
1699                     return opt_result::failure_at (phi, "unsupported phi\n");
1700                 }
1701
1702               continue;
1703             }
1704
1705           gcc_assert (stmt_info);
1706
1707           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1708                || STMT_VINFO_LIVE_P (stmt_info))
1709               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1710             /* A scalar-dependence cycle that we don't support.  */
1711             return opt_result::failure_at (phi,
1712                                            "not vectorized:"
1713                                            " scalar dependence cycle.\n");
1714
1715           if (STMT_VINFO_RELEVANT_P (stmt_info))
1716             {
1717               need_to_vectorize = true;
1718               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1719                   && ! PURE_SLP_STMT (stmt_info))
1720                 ok = vectorizable_induction (loop_vinfo,
1721                                              stmt_info, NULL, NULL,
1722                                              &cost_vec);
1723               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1724                         || (STMT_VINFO_DEF_TYPE (stmt_info)
1725                             == vect_double_reduction_def)
1726                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1727                        && ! PURE_SLP_STMT (stmt_info))
1728                 ok = vectorizable_reduction (loop_vinfo,
1729                                              stmt_info, NULL, NULL, &cost_vec);
1730             }
1731
1732           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1733           if (ok
1734               && STMT_VINFO_LIVE_P (stmt_info)
1735               && !PURE_SLP_STMT (stmt_info))
1736             ok = vectorizable_live_operation (loop_vinfo,
1737                                               stmt_info, NULL, NULL, NULL,
1738                                               -1, false, &cost_vec);
1739
1740           if (!ok)
1741             return opt_result::failure_at (phi,
1742                                            "not vectorized: relevant phi not "
1743                                            "supported: %G",
1744                                            static_cast <gimple *> (phi));
1745         }
1746
1747       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1748            gsi_next (&si))
1749         {
1750           gimple *stmt = gsi_stmt (si);
1751           if (!gimple_clobber_p (stmt)
1752               && !is_gimple_debug (stmt))
1753             {
1754               opt_result res
1755                 = vect_analyze_stmt (loop_vinfo,
1756                                      loop_vinfo->lookup_stmt (stmt),
1757                                      &need_to_vectorize,
1758                                      NULL, NULL, &cost_vec);
1759               if (!res)
1760                 return res;
1761             }
1762         }
1763     } /* bbs */
1764
1765   add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1766
1767   /* All operations in the loop are either irrelevant (deal with loop
1768      control, or dead), or only used outside the loop and can be moved
1769      out of the loop (e.g. invariants, inductions).  The loop can be
1770      optimized away by scalar optimizations.  We're better off not
1771      touching this loop.  */
1772   if (!need_to_vectorize)
1773     {
1774       if (dump_enabled_p ())
1775         dump_printf_loc (MSG_NOTE, vect_location,
1776                          "All the computation can be taken out of the loop.\n");
1777       return opt_result::failure_at
1778         (vect_location,
1779          "not vectorized: redundant loop. no profit to vectorize.\n");
1780     }
1781
1782   return opt_result::success ();
1783 }
1784
1785 /* Return true if we know that the iteration count is smaller than the
1786    vectorization factor.  Return false if it isn't, or if we can't be sure
1787    either way.  */
1788
1789 static bool
1790 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1791 {
1792   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1793
1794   HOST_WIDE_INT max_niter;
1795   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1796     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1797   else
1798     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1799
1800   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1801     return true;
1802
1803   return false;
1804 }
1805
1806 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1807    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1808    definitely no, or -1 if it's worth retrying.  */
1809
1810 static int
1811 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1812 {
1813   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1814   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1815
1816   /* Only loops that can handle partially-populated vectors can have iteration
1817      counts less than the vectorization factor.  */
1818   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1819     {
1820       if (vect_known_niters_smaller_than_vf (loop_vinfo))
1821         {
1822           if (dump_enabled_p ())
1823             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824                              "not vectorized: iteration count smaller than "
1825                              "vectorization factor.\n");
1826           return 0;
1827         }
1828     }
1829
1830   int min_profitable_iters, min_profitable_estimate;
1831   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1832                                       &min_profitable_estimate);
1833
1834   if (min_profitable_iters < 0)
1835     {
1836       if (dump_enabled_p ())
1837         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838                          "not vectorized: vectorization not profitable.\n");
1839       if (dump_enabled_p ())
1840         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1841                          "not vectorized: vector version will never be "
1842                          "profitable.\n");
1843       return -1;
1844     }
1845
1846   int min_scalar_loop_bound = (param_min_vect_loop_bound
1847                                * assumed_vf);
1848
1849   /* Use the cost model only if it is more conservative than user specified
1850      threshold.  */
1851   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1852                                     min_profitable_iters);
1853
1854   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1855
1856   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1857       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1858     {
1859       if (dump_enabled_p ())
1860         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861                          "not vectorized: vectorization not profitable.\n");
1862       if (dump_enabled_p ())
1863         dump_printf_loc (MSG_NOTE, vect_location,
1864                          "not vectorized: iteration count smaller than user "
1865                          "specified loop bound parameter or minimum profitable "
1866                          "iterations (whichever is more conservative).\n");
1867       return 0;
1868     }
1869
1870   /* The static profitablity threshold min_profitable_estimate includes
1871      the cost of having to check at runtime whether the scalar loop
1872      should be used instead.  If it turns out that we don't need or want
1873      such a check, the threshold we should use for the static estimate
1874      is simply the point at which the vector loop becomes more profitable
1875      than the scalar loop.  */
1876   if (min_profitable_estimate > min_profitable_iters
1877       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1878       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1879       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1880       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1881     {
1882       if (dump_enabled_p ())
1883         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1884                          " choice between the scalar and vector loops\n");
1885       min_profitable_estimate = min_profitable_iters;
1886     }
1887
1888   HOST_WIDE_INT estimated_niter;
1889
1890   /* If we are vectorizing an epilogue then we know the maximum number of
1891      scalar iterations it will cover is at least one lower than the
1892      vectorization factor of the main loop.  */
1893   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1894     estimated_niter
1895       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1896   else
1897     {
1898       estimated_niter = estimated_stmt_executions_int (loop);
1899       if (estimated_niter == -1)
1900         estimated_niter = likely_max_stmt_executions_int (loop);
1901     }
1902   if (estimated_niter != -1
1903       && ((unsigned HOST_WIDE_INT) estimated_niter
1904           < MAX (th, (unsigned) min_profitable_estimate)))
1905     {
1906       if (dump_enabled_p ())
1907         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1908                          "not vectorized: estimated iteration count too "
1909                          "small.\n");
1910       if (dump_enabled_p ())
1911         dump_printf_loc (MSG_NOTE, vect_location,
1912                          "not vectorized: estimated iteration count smaller "
1913                          "than specified loop bound parameter or minimum "
1914                          "profitable iterations (whichever is more "
1915                          "conservative).\n");
1916       return -1;
1917     }
1918
1919   return 1;
1920 }
1921
1922 static opt_result
1923 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1924                            vec<data_reference_p> *datarefs,
1925                            unsigned int *n_stmts)
1926 {
1927   *n_stmts = 0;
1928   for (unsigned i = 0; i < loop->num_nodes; i++)
1929     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1930          !gsi_end_p (gsi); gsi_next (&gsi))
1931       {
1932         gimple *stmt = gsi_stmt (gsi);
1933         if (is_gimple_debug (stmt))
1934           continue;
1935         ++(*n_stmts);
1936         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1937                                                         NULL, 0);
1938         if (!res)
1939           {
1940             if (is_gimple_call (stmt) && loop->safelen)
1941               {
1942                 tree fndecl = gimple_call_fndecl (stmt), op;
1943                 if (fndecl != NULL_TREE)
1944                   {
1945                     cgraph_node *node = cgraph_node::get (fndecl);
1946                     if (node != NULL && node->simd_clones != NULL)
1947                       {
1948                         unsigned int j, n = gimple_call_num_args (stmt);
1949                         for (j = 0; j < n; j++)
1950                           {
1951                             op = gimple_call_arg (stmt, j);
1952                             if (DECL_P (op)
1953                                 || (REFERENCE_CLASS_P (op)
1954                                     && get_base_address (op)))
1955                               break;
1956                           }
1957                         op = gimple_call_lhs (stmt);
1958                         /* Ignore #pragma omp declare simd functions
1959                            if they don't have data references in the
1960                            call stmt itself.  */
1961                         if (j == n
1962                             && !(op
1963                                  && (DECL_P (op)
1964                                      || (REFERENCE_CLASS_P (op)
1965                                          && get_base_address (op)))))
1966                           continue;
1967                       }
1968                   }
1969               }
1970             return res;
1971           }
1972         /* If dependence analysis will give up due to the limit on the
1973            number of datarefs stop here and fail fatally.  */
1974         if (datarefs->length ()
1975             > (unsigned)param_loop_max_datarefs_for_datadeps)
1976           return opt_result::failure_at (stmt, "exceeded param "
1977                                          "loop-max-datarefs-for-datadeps\n");
1978       }
1979   return opt_result::success ();
1980 }
1981
1982 /* Look for SLP-only access groups and turn each individual access into its own
1983    group.  */
1984 static void
1985 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1986 {
1987   unsigned int i;
1988   struct data_reference *dr;
1989
1990   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1991
1992   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1993   FOR_EACH_VEC_ELT (datarefs, i, dr)
1994     {
1995       gcc_assert (DR_REF (dr));
1996       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1997
1998       /* Check if the load is a part of an interleaving chain.  */
1999       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2000         {
2001           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2002           unsigned int group_size = DR_GROUP_SIZE (first_element);
2003
2004           /* Check if SLP-only groups.  */
2005           if (!STMT_SLP_TYPE (stmt_info)
2006               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2007             {
2008               /* Dissolve the group.  */
2009               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2010
2011               stmt_vec_info vinfo = first_element;
2012               while (vinfo)
2013                 {
2014                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2015                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2016                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2017                   DR_GROUP_SIZE (vinfo) = 1;
2018                   if (STMT_VINFO_STRIDED_P (first_element))
2019                     DR_GROUP_GAP (vinfo) = 0;
2020                   else
2021                     DR_GROUP_GAP (vinfo) = group_size - 1;
2022                   vinfo = next;
2023                 }
2024             }
2025         }
2026     }
2027 }
2028
2029 /* Determine if operating on full vectors for LOOP_VINFO might leave
2030    some scalar iterations still to do.  If so, decide how we should
2031    handle those scalar iterations.  The possibilities are:
2032
2033    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2034        In this case:
2035
2036          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2037          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2038          LOOP_VINFO_PEELING_FOR_NITER == false
2039
2040    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2041        to handle the remaining scalar iterations.  In this case:
2042
2043          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2044          LOOP_VINFO_PEELING_FOR_NITER == true
2045
2046        There are two choices:
2047
2048        (2a) Consider vectorizing the epilogue loop at the same VF as the
2049             main loop, but using partial vectors instead of full vectors.
2050             In this case:
2051
2052               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2053
2054        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2055             In this case:
2056
2057               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2058
2059    When FOR_EPILOGUE_P is true, make this determination based on the
2060    assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2061    based on the assumption that LOOP_VINFO is the main loop.  The caller
2062    has made sure that the number of iterations is set appropriately for
2063    this value of FOR_EPILOGUE_P.  */
2064
2065 opt_result
2066 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2067                                             bool for_epilogue_p)
2068 {
2069   /* Determine whether there would be any scalar iterations left over.  */
2070   bool need_peeling_or_partial_vectors_p
2071     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2072
2073   /* Decide whether to vectorize the loop with partial vectors.  */
2074   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2075   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2076   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2077       && need_peeling_or_partial_vectors_p)
2078     {
2079       /* For partial-vector-usage=1, try to push the handling of partial
2080          vectors to the epilogue, with the main loop continuing to operate
2081          on full vectors.
2082
2083          ??? We could then end up failing to use partial vectors if we
2084          decide to peel iterations into a prologue, and if the main loop
2085          then ends up processing fewer than VF iterations.  */
2086       if (param_vect_partial_vector_usage == 1
2087           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2088           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2089         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2090       else
2091         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2092     }
2093
2094   if (dump_enabled_p ())
2095     {
2096       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2097         dump_printf_loc (MSG_NOTE, vect_location,
2098                          "operating on partial vectors%s.\n",
2099                          for_epilogue_p ? " for epilogue loop" : "");
2100       else
2101         dump_printf_loc (MSG_NOTE, vect_location,
2102                          "operating only on full vectors%s.\n",
2103                          for_epilogue_p ? " for epilogue loop" : "");
2104     }
2105
2106   if (for_epilogue_p)
2107     {
2108       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2109       gcc_assert (orig_loop_vinfo);
2110       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2111         gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2112                               LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2113     }
2114
2115   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2116       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2117     {
2118       /* Check that the loop processes at least one full vector.  */
2119       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2120       tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2121       if (known_lt (wi::to_widest (scalar_niters), vf))
2122         return opt_result::failure_at (vect_location,
2123                                        "loop does not have enough iterations"
2124                                        " to support vectorization.\n");
2125
2126       /* If we need to peel an extra epilogue iteration to handle data
2127          accesses with gaps, check that there are enough scalar iterations
2128          available.
2129
2130          The check above is redundant with this one when peeling for gaps,
2131          but the distinction is useful for diagnostics.  */
2132       tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2133       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2134           && known_lt (wi::to_widest (scalar_nitersm1), vf))
2135         return opt_result::failure_at (vect_location,
2136                                        "loop does not have enough iterations"
2137                                        " to support peeling for gaps.\n");
2138     }
2139
2140   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2141     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2142        && need_peeling_or_partial_vectors_p);
2143
2144   return opt_result::success ();
2145 }
2146
2147 /* Function vect_analyze_loop_2.
2148
2149    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2150    for it.  The different analyses will record information in the
2151    loop_vec_info struct.  */
2152 static opt_result
2153 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2154 {
2155   opt_result ok = opt_result::success ();
2156   int res;
2157   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2158   poly_uint64 min_vf = 2;
2159   loop_vec_info orig_loop_vinfo = NULL;
2160
2161   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2162      loop_vec_info of the first vectorized loop.  */
2163   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2164     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2165   else
2166     orig_loop_vinfo = loop_vinfo;
2167   gcc_assert (orig_loop_vinfo);
2168
2169   /* The first group of checks is independent of the vector size.  */
2170   fatal = true;
2171
2172   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2173       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2174     return opt_result::failure_at (vect_location,
2175                                    "not vectorized: simd if(0)\n");
2176
2177   /* Find all data references in the loop (which correspond to vdefs/vuses)
2178      and analyze their evolution in the loop.  */
2179
2180   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2181
2182   /* Gather the data references and count stmts in the loop.  */
2183   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2184     {
2185       opt_result res
2186         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2187                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2188                                      n_stmts);
2189       if (!res)
2190         {
2191           if (dump_enabled_p ())
2192             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193                              "not vectorized: loop contains function "
2194                              "calls or data references that cannot "
2195                              "be analyzed\n");
2196           return res;
2197         }
2198       loop_vinfo->shared->save_datarefs ();
2199     }
2200   else
2201     loop_vinfo->shared->check_datarefs ();
2202
2203   /* Analyze the data references and also adjust the minimal
2204      vectorization factor according to the loads and stores.  */
2205
2206   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2207   if (!ok)
2208     {
2209       if (dump_enabled_p ())
2210         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2211                          "bad data references.\n");
2212       return ok;
2213     }
2214
2215   /* Classify all cross-iteration scalar data-flow cycles.
2216      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2217   vect_analyze_scalar_cycles (loop_vinfo);
2218
2219   vect_pattern_recog (loop_vinfo);
2220
2221   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2222
2223   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2224      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2225
2226   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2227   if (!ok)
2228     {
2229       if (dump_enabled_p ())
2230         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2231                          "bad data access.\n");
2232       return ok;
2233     }
2234
2235   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2236
2237   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2238   if (!ok)
2239     {
2240       if (dump_enabled_p ())
2241         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2242                          "unexpected pattern.\n");
2243       return ok;
2244     }
2245
2246   /* While the rest of the analysis below depends on it in some way.  */
2247   fatal = false;
2248
2249   /* Analyze data dependences between the data-refs in the loop
2250      and adjust the maximum vectorization factor according to
2251      the dependences.
2252      FORNOW: fail at the first data dependence that we encounter.  */
2253
2254   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2255   if (!ok)
2256     {
2257       if (dump_enabled_p ())
2258         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259                          "bad data dependence.\n");
2260       return ok;
2261     }
2262   if (max_vf != MAX_VECTORIZATION_FACTOR
2263       && maybe_lt (max_vf, min_vf))
2264     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2265   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2266
2267   ok = vect_determine_vectorization_factor (loop_vinfo);
2268   if (!ok)
2269     {
2270       if (dump_enabled_p ())
2271         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2272                          "can't determine vectorization factor.\n");
2273       return ok;
2274     }
2275   if (max_vf != MAX_VECTORIZATION_FACTOR
2276       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2277     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2278
2279   /* Compute the scalar iteration cost.  */
2280   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2281
2282   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2283
2284   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2285   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2286   if (!ok)
2287     return ok;
2288
2289   /* If there are any SLP instances mark them as pure_slp.  */
2290   bool slp = vect_make_slp_decision (loop_vinfo);
2291   if (slp)
2292     {
2293       /* Find stmts that need to be both vectorized and SLPed.  */
2294       vect_detect_hybrid_slp (loop_vinfo);
2295
2296       /* Update the vectorization factor based on the SLP decision.  */
2297       vect_update_vf_for_slp (loop_vinfo);
2298
2299       /* Optimize the SLP graph with the vectorization factor fixed.  */
2300       vect_optimize_slp (loop_vinfo);
2301
2302       /* Gather the loads reachable from the SLP graph entries.  */
2303       vect_gather_slp_loads (loop_vinfo);
2304     }
2305
2306   bool saved_can_use_partial_vectors_p
2307     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2308
2309   /* We don't expect to have to roll back to anything other than an empty
2310      set of rgroups.  */
2311   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2312
2313   /* This is the point where we can re-start analysis with SLP forced off.  */
2314 start_over:
2315
2316   /* Now the vectorization factor is final.  */
2317   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2318   gcc_assert (known_ne (vectorization_factor, 0U));
2319
2320   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2321     {
2322       dump_printf_loc (MSG_NOTE, vect_location,
2323                        "vectorization_factor = ");
2324       dump_dec (MSG_NOTE, vectorization_factor);
2325       dump_printf (MSG_NOTE, ", niters = %wd\n",
2326                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2327     }
2328
2329   /* Analyze the alignment of the data-refs in the loop.
2330      Fail if a data reference is found that cannot be vectorized.  */
2331
2332   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2333   if (!ok)
2334     {
2335       if (dump_enabled_p ())
2336         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337                          "bad data alignment.\n");
2338       return ok;
2339     }
2340
2341   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2342      It is important to call pruning after vect_analyze_data_ref_accesses,
2343      since we use grouping information gathered by interleaving analysis.  */
2344   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2345   if (!ok)
2346     return ok;
2347
2348   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2349      vectorization, since we do not want to add extra peeling or
2350      add versioning for alignment.  */
2351   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2352     /* This pass will decide on using loop versioning and/or loop peeling in
2353        order to enhance the alignment of data references in the loop.  */
2354     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2355   if (!ok)
2356     return ok;
2357
2358   if (slp)
2359     {
2360       /* Analyze operations in the SLP instances.  Note this may
2361          remove unsupported SLP instances which makes the above
2362          SLP kind detection invalid.  */
2363       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2364       vect_slp_analyze_operations (loop_vinfo);
2365       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2366         {
2367           ok = opt_result::failure_at (vect_location,
2368                                        "unsupported SLP instances\n");
2369           goto again;
2370         }
2371
2372       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2373       slp_tree load_node, slp_root;
2374       unsigned i, x;
2375       slp_instance instance;
2376       bool can_use_lanes = true;
2377       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2378         {
2379           slp_root = SLP_INSTANCE_TREE (instance);
2380           int group_size = SLP_TREE_LANES (slp_root);
2381           tree vectype = SLP_TREE_VECTYPE (slp_root);
2382           bool loads_permuted = false;
2383           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2384             {
2385               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2386                 continue;
2387               unsigned j;
2388               stmt_vec_info load_info;
2389               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2390                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2391                   {
2392                     loads_permuted = true;
2393                     break;
2394                   }
2395             }
2396
2397           /* If the loads and stores can be handled with load/store-lane
2398              instructions record it and move on to the next instance.  */
2399           if (loads_permuted
2400               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2401               && vect_store_lanes_supported (vectype, group_size, false))
2402             {
2403               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2404                 {
2405                   stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2406                       (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2407                   /* Use SLP for strided accesses (or if we can't
2408                      load-lanes).  */
2409                   if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2410                       || ! vect_load_lanes_supported
2411                             (STMT_VINFO_VECTYPE (stmt_vinfo),
2412                              DR_GROUP_SIZE (stmt_vinfo), false))
2413                     break;
2414                 }
2415
2416               can_use_lanes
2417                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2418
2419               if (can_use_lanes && dump_enabled_p ())
2420                 dump_printf_loc (MSG_NOTE, vect_location,
2421                                  "SLP instance %p can use load/store-lanes\n",
2422                                  instance);
2423             }
2424           else
2425             {
2426               can_use_lanes = false;
2427               break;
2428             }
2429         }
2430
2431       /* If all SLP instances can use load/store-lanes abort SLP and try again
2432          with SLP disabled.  */
2433       if (can_use_lanes)
2434         {
2435           ok = opt_result::failure_at (vect_location,
2436                                        "Built SLP cancelled: can use "
2437                                        "load/store-lanes\n");
2438           if (dump_enabled_p ())
2439             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2440                              "Built SLP cancelled: all SLP instances support "
2441                              "load/store-lanes\n");
2442           goto again;
2443         }
2444     }
2445
2446   /* Dissolve SLP-only groups.  */
2447   vect_dissolve_slp_only_groups (loop_vinfo);
2448
2449   /* Scan all the remaining operations in the loop that are not subject
2450      to SLP and make sure they are vectorizable.  */
2451   ok = vect_analyze_loop_operations (loop_vinfo);
2452   if (!ok)
2453     {
2454       if (dump_enabled_p ())
2455         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2456                          "bad operation or unsupported loop bound.\n");
2457       return ok;
2458     }
2459
2460   /* For now, we don't expect to mix both masking and length approaches for one
2461      loop, disable it if both are recorded.  */
2462   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2463       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2464       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2465     {
2466       if (dump_enabled_p ())
2467         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2468                          "can't vectorize a loop with partial vectors"
2469                          " because we don't expect to mix different"
2470                          " approaches with partial vectors for the"
2471                          " same loop.\n");
2472       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2473     }
2474
2475   /* If we still have the option of using partial vectors,
2476      check whether we can generate the necessary loop controls.  */
2477   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2478       && !vect_verify_full_masking (loop_vinfo)
2479       && !vect_verify_loop_lens (loop_vinfo))
2480     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2481
2482   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2483      to be able to handle fewer than VF scalars, or needs to have a lower VF
2484      than the main loop.  */
2485   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2486       && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2487       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2488                    LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2489     return opt_result::failure_at (vect_location,
2490                                    "Vectorization factor too high for"
2491                                    " epilogue loop.\n");
2492
2493   /* Decide whether this loop_vinfo should use partial vectors or peeling,
2494      assuming that the loop will be used as a main loop.  We will redo
2495      this analysis later if we instead decide to use the loop as an
2496      epilogue loop.  */
2497   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2498   if (!ok)
2499     return ok;
2500
2501   /* Check the costings of the loop make vectorizing worthwhile.  */
2502   res = vect_analyze_loop_costing (loop_vinfo);
2503   if (res < 0)
2504     {
2505       ok = opt_result::failure_at (vect_location,
2506                                    "Loop costings may not be worthwhile.\n");
2507       goto again;
2508     }
2509   if (!res)
2510     return opt_result::failure_at (vect_location,
2511                                    "Loop costings not worthwhile.\n");
2512
2513   /* If an epilogue loop is required make sure we can create one.  */
2514   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2515       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2516     {
2517       if (dump_enabled_p ())
2518         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2519       if (!vect_can_advance_ivs_p (loop_vinfo)
2520           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2521                                            single_exit (LOOP_VINFO_LOOP
2522                                                          (loop_vinfo))))
2523         {
2524           ok = opt_result::failure_at (vect_location,
2525                                        "not vectorized: can't create required "
2526                                        "epilog loop\n");
2527           goto again;
2528         }
2529     }
2530
2531   /* During peeling, we need to check if number of loop iterations is
2532      enough for both peeled prolog loop and vector loop.  This check
2533      can be merged along with threshold check of loop versioning, so
2534      increase threshold for this case if necessary.
2535
2536      If we are analyzing an epilogue we still want to check what its
2537      versioning threshold would be.  If we decide to vectorize the epilogues we
2538      will want to use the lowest versioning threshold of all epilogues and main
2539      loop.  This will enable us to enter a vectorized epilogue even when
2540      versioning the loop.  We can't simply check whether the epilogue requires
2541      versioning though since we may have skipped some versioning checks when
2542      analyzing the epilogue.  For instance, checks for alias versioning will be
2543      skipped when dealing with epilogues as we assume we already checked them
2544      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
2545   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2546     {
2547       poly_uint64 niters_th = 0;
2548       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2549
2550       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2551         {
2552           /* Niters for peeled prolog loop.  */
2553           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2554             {
2555               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2556               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2557               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2558             }
2559           else
2560             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2561         }
2562
2563       /* Niters for at least one iteration of vectorized loop.  */
2564       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2565         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2566       /* One additional iteration because of peeling for gap.  */
2567       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2568         niters_th += 1;
2569
2570       /*  Use the same condition as vect_transform_loop to decide when to use
2571           the cost to determine a versioning threshold.  */
2572       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2573           && ordered_p (th, niters_th))
2574         niters_th = ordered_max (poly_uint64 (th), niters_th);
2575
2576       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2577     }
2578
2579   gcc_assert (known_eq (vectorization_factor,
2580                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2581
2582   /* Ok to vectorize!  */
2583   return opt_result::success ();
2584
2585 again:
2586   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2587   gcc_assert (!ok);
2588
2589   /* Try again with SLP forced off but if we didn't do any SLP there is
2590      no point in re-trying.  */
2591   if (!slp)
2592     return ok;
2593
2594   /* If there are reduction chains re-trying will fail anyway.  */
2595   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2596     return ok;
2597
2598   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2599      via interleaving or lane instructions.  */
2600   slp_instance instance;
2601   slp_tree node;
2602   unsigned i, j;
2603   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2604     {
2605       stmt_vec_info vinfo;
2606       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2607       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2608         continue;
2609       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2610       unsigned int size = DR_GROUP_SIZE (vinfo);
2611       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2612       if (! vect_store_lanes_supported (vectype, size, false)
2613          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2614          && ! vect_grouped_store_supported (vectype, size))
2615         return opt_result::failure_at (vinfo->stmt,
2616                                        "unsupported grouped store\n");
2617       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2618         {
2619           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2620           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2621           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2622           size = DR_GROUP_SIZE (vinfo);
2623           vectype = STMT_VINFO_VECTYPE (vinfo);
2624           if (! vect_load_lanes_supported (vectype, size, false)
2625               && ! vect_grouped_load_supported (vectype, single_element_p,
2626                                                 size))
2627             return opt_result::failure_at (vinfo->stmt,
2628                                            "unsupported grouped load\n");
2629         }
2630     }
2631
2632   if (dump_enabled_p ())
2633     dump_printf_loc (MSG_NOTE, vect_location,
2634                      "re-trying with SLP disabled\n");
2635
2636   /* Roll back state appropriately.  No SLP this time.  */
2637   slp = false;
2638   /* Restore vectorization factor as it were without SLP.  */
2639   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2640   /* Free the SLP instances.  */
2641   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2642     vect_free_slp_instance (instance);
2643   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2644   /* Reset SLP type to loop_vect on all stmts.  */
2645   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2646     {
2647       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2648       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2649            !gsi_end_p (si); gsi_next (&si))
2650         {
2651           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2652           STMT_SLP_TYPE (stmt_info) = loop_vect;
2653           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2654               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2655             {
2656               /* vectorizable_reduction adjusts reduction stmt def-types,
2657                  restore them to that of the PHI.  */
2658               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2659                 = STMT_VINFO_DEF_TYPE (stmt_info);
2660               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2661                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
2662                 = STMT_VINFO_DEF_TYPE (stmt_info);
2663             }
2664         }
2665       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2666            !gsi_end_p (si); gsi_next (&si))
2667         {
2668           if (is_gimple_debug (gsi_stmt (si)))
2669             continue;
2670           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2671           STMT_SLP_TYPE (stmt_info) = loop_vect;
2672           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2673             {
2674               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2675               stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2676               STMT_SLP_TYPE (stmt_info) = loop_vect;
2677               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2678                    !gsi_end_p (pi); gsi_next (&pi))
2679                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2680                   = loop_vect;
2681             }
2682         }
2683     }
2684   /* Free optimized alias test DDRS.  */
2685   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2686   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2687   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2688   /* Reset target cost data.  */
2689   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2690   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2691     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2692   /* Reset accumulated rgroup information.  */
2693   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2694   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2695   /* Reset assorted flags.  */
2696   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2697   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2698   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2699   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2700   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2701     = saved_can_use_partial_vectors_p;
2702
2703   goto start_over;
2704 }
2705
2706 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2707    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2708    OLD_LOOP_VINFO is better unless something specifically indicates
2709    otherwise.
2710
2711    Note that this deliberately isn't a partial order.  */
2712
2713 static bool
2714 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2715                           loop_vec_info old_loop_vinfo)
2716 {
2717   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2718   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2719
2720   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2721   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2722
2723   /* Always prefer a VF of loop->simdlen over any other VF.  */
2724   if (loop->simdlen)
2725     {
2726       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2727       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2728       if (new_simdlen_p != old_simdlen_p)
2729         return new_simdlen_p;
2730     }
2731
2732   /* Limit the VFs to what is likely to be the maximum number of iterations,
2733      to handle cases in which at least one loop_vinfo is fully-masked.  */
2734   HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2735   if (estimated_max_niter != -1)
2736     {
2737       if (known_le (estimated_max_niter, new_vf))
2738         new_vf = estimated_max_niter;
2739       if (known_le (estimated_max_niter, old_vf))
2740         old_vf = estimated_max_niter;
2741     }
2742
2743   /* Check whether the (fractional) cost per scalar iteration is lower
2744      or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf.  */
2745   poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2746                              * poly_widest_int (old_vf));
2747   poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2748                              * poly_widest_int (new_vf));
2749   if (maybe_lt (rel_old, rel_new))
2750     {
2751       /* When old_loop_vinfo uses a variable vectorization factor,
2752          we know that it has a lower cost for at least one runtime VF.
2753          However, we don't know how likely that VF is.
2754
2755          One option would be to compare the costs for the estimated VFs.
2756          The problem is that that can put too much pressure on the cost
2757          model.  E.g. if the estimated VF is also the lowest possible VF,
2758          and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2759          for the estimated VF, we'd then choose new_loop_vinfo even
2760          though (a) new_loop_vinfo might not actually be better than
2761          old_loop_vinfo for that VF and (b) it would be significantly
2762          worse at larger VFs.
2763
2764          Here we go for a hacky compromise: pick new_loop_vinfo if it is
2765          no more expensive than old_loop_vinfo even after doubling the
2766          estimated old_loop_vinfo VF.  For all but trivial loops, this
2767          ensures that we only pick new_loop_vinfo if it is significantly
2768          better than old_loop_vinfo at the estimated VF.  */
2769       if (rel_new.is_constant ())
2770         return false;
2771
2772       HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2773       HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2774       widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2775                                       * widest_int (old_estimated_vf));
2776       widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2777                                       * widest_int (new_estimated_vf));
2778       return estimated_rel_new * 2 <= estimated_rel_old;
2779     }
2780   if (known_lt (rel_new, rel_old))
2781     return true;
2782
2783   /* If there's nothing to choose between the loop bodies, see whether
2784      there's a difference in the prologue and epilogue costs.  */
2785   if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2786     return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2787
2788   return false;
2789 }
2790
2791 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2792    true if we should.  */
2793
2794 static bool
2795 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2796                         loop_vec_info old_loop_vinfo)
2797 {
2798   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2799     return false;
2800
2801   if (dump_enabled_p ())
2802     dump_printf_loc (MSG_NOTE, vect_location,
2803                      "***** Preferring vector mode %s to vector mode %s\n",
2804                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
2805                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
2806   return true;
2807 }
2808
2809 /* Function vect_analyze_loop.
2810
2811    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2812    for it.  The different analyses will record information in the
2813    loop_vec_info struct.  */
2814 opt_loop_vec_info
2815 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2816 {
2817   auto_vector_modes vector_modes;
2818
2819   /* Autodetect first vector size we try.  */
2820   unsigned int autovec_flags
2821     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2822                                                     loop->simdlen != 0);
2823   unsigned int mode_i = 0;
2824
2825   DUMP_VECT_SCOPE ("analyze_loop_nest");
2826
2827   if (loop_outer (loop)
2828       && loop_vec_info_for_loop (loop_outer (loop))
2829       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2830     return opt_loop_vec_info::failure_at (vect_location,
2831                                           "outer-loop already vectorized.\n");
2832
2833   if (!find_loop_nest (loop, &shared->loop_nest))
2834     return opt_loop_vec_info::failure_at
2835       (vect_location,
2836        "not vectorized: loop nest containing two or more consecutive inner"
2837        " loops cannot be vectorized\n");
2838
2839   unsigned n_stmts = 0;
2840   machine_mode autodetected_vector_mode = VOIDmode;
2841   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2842   machine_mode next_vector_mode = VOIDmode;
2843   poly_uint64 lowest_th = 0;
2844   unsigned vectorized_loops = 0;
2845   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2846                              && !unlimited_cost_model (loop));
2847
2848   bool vect_epilogues = false;
2849   opt_result res = opt_result::success ();
2850   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2851   while (1)
2852     {
2853       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2854       opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2855       if (!loop_vinfo)
2856         {
2857           if (dump_enabled_p ())
2858             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2859                              "bad loop form.\n");
2860           gcc_checking_assert (first_loop_vinfo == NULL);
2861           return loop_vinfo;
2862         }
2863       loop_vinfo->vector_mode = next_vector_mode;
2864
2865       bool fatal = false;
2866
2867       /* When pick_lowest_cost_p is true, we should in principle iterate
2868          over all the loop_vec_infos that LOOP_VINFO could replace and
2869          try to vectorize LOOP_VINFO under the same conditions.
2870          E.g. when trying to replace an epilogue loop, we should vectorize
2871          LOOP_VINFO as an epilogue loop with the same VF limit.  When trying
2872          to replace the main loop, we should vectorize LOOP_VINFO as a main
2873          loop too.
2874
2875          However, autovectorize_vector_modes is usually sorted as follows:
2876
2877          - Modes that naturally produce lower VFs usually follow modes that
2878            naturally produce higher VFs.
2879
2880          - When modes naturally produce the same VF, maskable modes
2881            usually follow unmaskable ones, so that the maskable mode
2882            can be used to vectorize the epilogue of the unmaskable mode.
2883
2884          This order is preferred because it leads to the maximum
2885          epilogue vectorization opportunities.  Targets should only use
2886          a different order if they want to make wide modes available while
2887          disparaging them relative to earlier, smaller modes.  The assumption
2888          in that case is that the wider modes are more expensive in some
2889          way that isn't reflected directly in the costs.
2890
2891          There should therefore be few interesting cases in which
2892          LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2893          treated as a standalone loop, and ends up being genuinely cheaper
2894          than FIRST_LOOP_VINFO.  */
2895       if (vect_epilogues)
2896         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2897
2898       res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2899       if (mode_i == 0)
2900         autodetected_vector_mode = loop_vinfo->vector_mode;
2901       if (dump_enabled_p ())
2902         {
2903           if (res)
2904             dump_printf_loc (MSG_NOTE, vect_location,
2905                              "***** Analysis succeeded with vector mode %s\n",
2906                              GET_MODE_NAME (loop_vinfo->vector_mode));
2907           else
2908             dump_printf_loc (MSG_NOTE, vect_location,
2909                              "***** Analysis failed with vector mode %s\n",
2910                              GET_MODE_NAME (loop_vinfo->vector_mode));
2911         }
2912
2913       loop->aux = NULL;
2914
2915       if (!fatal)
2916         while (mode_i < vector_modes.length ()
2917                && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2918           {
2919             if (dump_enabled_p ())
2920               dump_printf_loc (MSG_NOTE, vect_location,
2921                                "***** The result for vector mode %s would"
2922                                " be the same\n",
2923                                GET_MODE_NAME (vector_modes[mode_i]));
2924             mode_i += 1;
2925           }
2926
2927       if (res)
2928         {
2929           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2930           vectorized_loops++;
2931
2932           /* Once we hit the desired simdlen for the first time,
2933              discard any previous attempts.  */
2934           if (simdlen
2935               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2936             {
2937               delete first_loop_vinfo;
2938               first_loop_vinfo = opt_loop_vec_info::success (NULL);
2939               LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2940               simdlen = 0;
2941             }
2942           else if (pick_lowest_cost_p && first_loop_vinfo)
2943             {
2944               /* Keep trying to roll back vectorization attempts while the
2945                  loop_vec_infos they produced were worse than this one.  */
2946               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2947               while (!vinfos.is_empty ()
2948                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2949                 {
2950                   gcc_assert (vect_epilogues);
2951                   delete vinfos.pop ();
2952                 }
2953               if (vinfos.is_empty ()
2954                   && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2955                 {
2956                   delete first_loop_vinfo;
2957                   first_loop_vinfo = opt_loop_vec_info::success (NULL);
2958                   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2959                 }
2960             }
2961
2962           if (first_loop_vinfo == NULL)
2963             {
2964               first_loop_vinfo = loop_vinfo;
2965               lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2966             }
2967           else if (vect_epilogues
2968                    /* For now only allow one epilogue loop.  */
2969                    && first_loop_vinfo->epilogue_vinfos.is_empty ())
2970             {
2971               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2972               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2973               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2974                           || maybe_ne (lowest_th, 0U));
2975               /* Keep track of the known smallest versioning
2976                  threshold.  */
2977               if (ordered_p (lowest_th, th))
2978                 lowest_th = ordered_min (lowest_th, th);
2979             }
2980           else
2981             {
2982               delete loop_vinfo;
2983               loop_vinfo = opt_loop_vec_info::success (NULL);
2984             }
2985
2986           /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2987              enabled, SIMDUID is not set, it is the innermost loop and we have
2988              either already found the loop's SIMDLEN or there was no SIMDLEN to
2989              begin with.
2990              TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
2991           vect_epilogues = (!simdlen
2992                             && loop->inner == NULL
2993                             && param_vect_epilogues_nomask
2994                             && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2995                             && !loop->simduid
2996                             /* For now only allow one epilogue loop, but allow
2997                                pick_lowest_cost_p to replace it.  */
2998                             && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2999                                 || pick_lowest_cost_p));
3000
3001           /* Commit to first_loop_vinfo if we have no reason to try
3002              alternatives.  */
3003           if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3004             break;
3005         }
3006       else
3007         {
3008           delete loop_vinfo;
3009           loop_vinfo = opt_loop_vec_info::success (NULL);
3010           if (fatal)
3011             {
3012               gcc_checking_assert (first_loop_vinfo == NULL);
3013               break;
3014             }
3015         }
3016
3017       /* Handle the case that the original loop can use partial
3018          vectorization, but want to only adopt it for the epilogue.
3019          The retry should be in the same mode as original.  */
3020       if (vect_epilogues
3021           && loop_vinfo
3022           && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3023         {
3024           gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3025                       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3026           if (dump_enabled_p ())
3027             dump_printf_loc (MSG_NOTE, vect_location,
3028                              "***** Re-trying analysis with same vector mode"
3029                              " %s for epilogue with partial vectors.\n",
3030                              GET_MODE_NAME (loop_vinfo->vector_mode));
3031           continue;
3032         }
3033
3034       if (mode_i < vector_modes.length ()
3035           && VECTOR_MODE_P (autodetected_vector_mode)
3036           && (related_vector_mode (vector_modes[mode_i],
3037                                    GET_MODE_INNER (autodetected_vector_mode))
3038               == autodetected_vector_mode)
3039           && (related_vector_mode (autodetected_vector_mode,
3040                                    GET_MODE_INNER (vector_modes[mode_i]))
3041               == vector_modes[mode_i]))
3042         {
3043           if (dump_enabled_p ())
3044             dump_printf_loc (MSG_NOTE, vect_location,
3045                              "***** Skipping vector mode %s, which would"
3046                              " repeat the analysis for %s\n",
3047                              GET_MODE_NAME (vector_modes[mode_i]),
3048                              GET_MODE_NAME (autodetected_vector_mode));
3049           mode_i += 1;
3050         }
3051
3052       if (mode_i == vector_modes.length ()
3053           || autodetected_vector_mode == VOIDmode)
3054         break;
3055
3056       /* Try the next biggest vector size.  */
3057       next_vector_mode = vector_modes[mode_i++];
3058       if (dump_enabled_p ())
3059         dump_printf_loc (MSG_NOTE, vect_location,
3060                          "***** Re-trying analysis with vector mode %s\n",
3061                          GET_MODE_NAME (next_vector_mode));
3062     }
3063
3064   if (first_loop_vinfo)
3065     {
3066       loop->aux = (loop_vec_info) first_loop_vinfo;
3067       if (dump_enabled_p ())
3068         dump_printf_loc (MSG_NOTE, vect_location,
3069                          "***** Choosing vector mode %s\n",
3070                          GET_MODE_NAME (first_loop_vinfo->vector_mode));
3071       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3072       return first_loop_vinfo;
3073     }
3074
3075   return opt_loop_vec_info::propagate_failure (res);
3076 }
3077
3078 /* Return true if there is an in-order reduction function for CODE, storing
3079    it in *REDUC_FN if so.  */
3080
3081 static bool
3082 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3083 {
3084   switch (code)
3085     {
3086     case PLUS_EXPR:
3087       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3088       return true;
3089
3090     default:
3091       return false;
3092     }
3093 }
3094
3095 /* Function reduction_fn_for_scalar_code
3096
3097    Input:
3098    CODE - tree_code of a reduction operations.
3099
3100    Output:
3101    REDUC_FN - the corresponding internal function to be used to reduce the
3102       vector of partial results into a single scalar result, or IFN_LAST
3103       if the operation is a supported reduction operation, but does not have
3104       such an internal function.
3105
3106    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3107
3108 static bool
3109 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3110 {
3111   switch (code)
3112     {
3113       case MAX_EXPR:
3114         *reduc_fn = IFN_REDUC_MAX;
3115         return true;
3116
3117       case MIN_EXPR:
3118         *reduc_fn = IFN_REDUC_MIN;
3119         return true;
3120
3121       case PLUS_EXPR:
3122         *reduc_fn = IFN_REDUC_PLUS;
3123         return true;
3124
3125       case BIT_AND_EXPR:
3126         *reduc_fn = IFN_REDUC_AND;
3127         return true;
3128
3129       case BIT_IOR_EXPR:
3130         *reduc_fn = IFN_REDUC_IOR;
3131         return true;
3132
3133       case BIT_XOR_EXPR:
3134         *reduc_fn = IFN_REDUC_XOR;
3135         return true;
3136
3137       case MULT_EXPR:
3138       case MINUS_EXPR:
3139         *reduc_fn = IFN_LAST;
3140         return true;
3141
3142       default:
3143        return false;
3144     }
3145 }
3146
3147 /* If there is a neutral value X such that SLP reduction NODE would not
3148    be affected by the introduction of additional X elements, return that X,
3149    otherwise return null.  CODE is the code of the reduction and VECTOR_TYPE
3150    is the vector type that would hold element X.  REDUC_CHAIN is true if
3151    the SLP statements perform a single reduction, false if each statement
3152    performs an independent reduction.  */
3153
3154 static tree
3155 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3156                               tree_code code, bool reduc_chain)
3157 {
3158   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3159   stmt_vec_info stmt_vinfo = stmts[0];
3160   tree scalar_type = TREE_TYPE (vector_type);
3161   class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3162   gcc_assert (loop);
3163
3164   switch (code)
3165     {
3166     case WIDEN_SUM_EXPR:
3167     case DOT_PROD_EXPR:
3168     case SAD_EXPR:
3169     case PLUS_EXPR:
3170     case MINUS_EXPR:
3171     case BIT_IOR_EXPR:
3172     case BIT_XOR_EXPR:
3173       return build_zero_cst (scalar_type);
3174
3175     case MULT_EXPR:
3176       return build_one_cst (scalar_type);
3177
3178     case BIT_AND_EXPR:
3179       return build_all_ones_cst (scalar_type);
3180
3181     case MAX_EXPR:
3182     case MIN_EXPR:
3183       /* For MIN/MAX the initial values are neutral.  A reduction chain
3184          has only a single initial value, so that value is neutral for
3185          all statements.  */
3186       if (reduc_chain)
3187         return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3188                                       loop_preheader_edge (loop));
3189       return NULL_TREE;
3190
3191     default:
3192       return NULL_TREE;
3193     }
3194 }
3195
3196 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3197    STMT is printed with a message MSG. */
3198
3199 static void
3200 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3201 {
3202   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3203 }
3204
3205 /* Return true if we need an in-order reduction for operation CODE
3206    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3207    overflow must wrap.  */
3208
3209 bool
3210 needs_fold_left_reduction_p (tree type, tree_code code)
3211 {
3212   /* CHECKME: check for !flag_finite_math_only too?  */
3213   if (SCALAR_FLOAT_TYPE_P (type))
3214     switch (code)
3215       {
3216       case MIN_EXPR:
3217       case MAX_EXPR:
3218         return false;
3219
3220       default:
3221         return !flag_associative_math;
3222       }
3223
3224   if (INTEGRAL_TYPE_P (type))
3225     {
3226       if (!operation_no_trapping_overflow (type, code))
3227         return true;
3228       return false;
3229     }
3230
3231   if (SAT_FIXED_POINT_TYPE_P (type))
3232     return true;
3233
3234   return false;
3235 }
3236
3237 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3238    has a handled computation expression.  Store the main reduction
3239    operation in *CODE.  */
3240
3241 static bool
3242 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3243                       tree loop_arg, enum tree_code *code,
3244                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3245 {
3246   auto_bitmap visited;
3247   tree lookfor = PHI_RESULT (phi);
3248   ssa_op_iter curri;
3249   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3250   while (USE_FROM_PTR (curr) != loop_arg)
3251     curr = op_iter_next_use (&curri);
3252   curri.i = curri.numops;
3253   do
3254     {
3255       path.safe_push (std::make_pair (curri, curr));
3256       tree use = USE_FROM_PTR (curr);
3257       if (use == lookfor)
3258         break;
3259       gimple *def = SSA_NAME_DEF_STMT (use);
3260       if (gimple_nop_p (def)
3261           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3262         {
3263 pop:
3264           do
3265             {
3266               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3267               curri = x.first;
3268               curr = x.second;
3269               do
3270                 curr = op_iter_next_use (&curri);
3271               /* Skip already visited or non-SSA operands (from iterating
3272                  over PHI args).  */
3273               while (curr != NULL_USE_OPERAND_P
3274                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3275                          || ! bitmap_set_bit (visited,
3276                                               SSA_NAME_VERSION
3277                                                 (USE_FROM_PTR (curr)))));
3278             }
3279           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3280           if (curr == NULL_USE_OPERAND_P)
3281             break;
3282         }
3283       else
3284         {
3285           if (gimple_code (def) == GIMPLE_PHI)
3286             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3287           else
3288             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3289           while (curr != NULL_USE_OPERAND_P
3290                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3291                      || ! bitmap_set_bit (visited,
3292                                           SSA_NAME_VERSION
3293                                             (USE_FROM_PTR (curr)))))
3294             curr = op_iter_next_use (&curri);
3295           if (curr == NULL_USE_OPERAND_P)
3296             goto pop;
3297         }
3298     }
3299   while (1);
3300   if (dump_file && (dump_flags & TDF_DETAILS))
3301     {
3302       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3303       unsigned i;
3304       std::pair<ssa_op_iter, use_operand_p> *x;
3305       FOR_EACH_VEC_ELT (path, i, x)
3306         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3307       dump_printf (MSG_NOTE, "\n");
3308     }
3309
3310   /* Check whether the reduction path detected is valid.  */
3311   bool fail = path.length () == 0;
3312   bool neg = false;
3313   int sign = -1;
3314   *code = ERROR_MARK;
3315   for (unsigned i = 1; i < path.length (); ++i)
3316     {
3317       gimple *use_stmt = USE_STMT (path[i].second);
3318       tree op = USE_FROM_PTR (path[i].second);
3319       if (! is_gimple_assign (use_stmt)
3320           /* The following make sure we can compute the operand index
3321              easily plus it mostly disallows chaining via COND_EXPR condition
3322              operands.  */
3323           || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3324               && (gimple_num_ops (use_stmt) <= 2
3325                   || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3326               && (gimple_num_ops (use_stmt) <= 3
3327                   || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3328         {
3329           fail = true;
3330           break;
3331         }
3332       /* Check there's only a single stmt the op is used on.  For the
3333          not value-changing tail and the last stmt allow out-of-loop uses.
3334          ???  We could relax this and handle arbitrary live stmts by
3335          forcing a scalar epilogue for example.  */
3336       imm_use_iterator imm_iter;
3337       gimple *op_use_stmt;
3338       unsigned cnt = 0;
3339       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3340         if (!is_gimple_debug (op_use_stmt)
3341             && (*code != ERROR_MARK
3342                 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3343           {
3344             /* We want to allow x + x but not x < 1 ? x : 2.  */
3345             if (is_gimple_assign (op_use_stmt)
3346                 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3347               {
3348                 use_operand_p use_p;
3349                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3350                   cnt++;
3351               }
3352             else
3353               cnt++;
3354           }
3355       if (cnt != 1)
3356         {
3357           fail = true;
3358           break;
3359         }
3360       tree_code use_code = gimple_assign_rhs_code (use_stmt);
3361       if (use_code == MINUS_EXPR)
3362         {
3363           use_code = PLUS_EXPR;
3364           /* Track whether we negate the reduction value each iteration.  */
3365           if (gimple_assign_rhs2 (use_stmt) == op)
3366             neg = ! neg;
3367         }
3368       if (CONVERT_EXPR_CODE_P (use_code)
3369           && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3370                                     TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3371         ;
3372       else if (*code == ERROR_MARK)
3373         {
3374           *code = use_code;
3375           sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3376         }
3377       else if (use_code != *code)
3378         {
3379           fail = true;
3380           break;
3381         }
3382       else if ((use_code == MIN_EXPR
3383                 || use_code == MAX_EXPR)
3384                && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3385         {
3386           fail = true;
3387           break;
3388         }
3389     }
3390   return ! fail && ! neg && *code != ERROR_MARK;
3391 }
3392
3393 bool
3394 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3395                       tree loop_arg, enum tree_code code)
3396 {
3397   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3398   enum tree_code code_;
3399   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3400           && code_ == code);
3401 }
3402
3403
3404
3405 /* Function vect_is_simple_reduction
3406
3407    (1) Detect a cross-iteration def-use cycle that represents a simple
3408    reduction computation.  We look for the following pattern:
3409
3410    loop_header:
3411      a1 = phi < a0, a2 >
3412      a3 = ...
3413      a2 = operation (a3, a1)
3414
3415    or
3416
3417    a3 = ...
3418    loop_header:
3419      a1 = phi < a0, a2 >
3420      a2 = operation (a3, a1)
3421
3422    such that:
3423    1. operation is commutative and associative and it is safe to
3424       change the order of the computation
3425    2. no uses for a2 in the loop (a2 is used out of the loop)
3426    3. no uses of a1 in the loop besides the reduction operation
3427    4. no uses of a1 outside the loop.
3428
3429    Conditions 1,4 are tested here.
3430    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3431
3432    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3433    nested cycles.
3434
3435    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3436    reductions:
3437
3438      a1 = phi < a0, a2 >
3439      inner loop (def of a3)
3440      a2 = phi < a3 >
3441
3442    (4) Detect condition expressions, ie:
3443      for (int i = 0; i < N; i++)
3444        if (a[i] < val)
3445         ret_val = a[i];
3446
3447 */
3448
3449 static stmt_vec_info
3450 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3451                           bool *double_reduc, bool *reduc_chain_p)
3452 {
3453   gphi *phi = as_a <gphi *> (phi_info->stmt);
3454   gimple *phi_use_stmt = NULL;
3455   imm_use_iterator imm_iter;
3456   use_operand_p use_p;
3457
3458   *double_reduc = false;
3459   *reduc_chain_p = false;
3460   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3461
3462   tree phi_name = PHI_RESULT (phi);
3463   /* ???  If there are no uses of the PHI result the inner loop reduction
3464      won't be detected as possibly double-reduction by vectorizable_reduction
3465      because that tries to walk the PHI arg from the preheader edge which
3466      can be constant.  See PR60382.  */
3467   if (has_zero_uses (phi_name))
3468     return NULL;
3469   class loop *loop = (gimple_bb (phi))->loop_father;
3470   unsigned nphi_def_loop_uses = 0;
3471   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3472     {
3473       gimple *use_stmt = USE_STMT (use_p);
3474       if (is_gimple_debug (use_stmt))
3475         continue;
3476
3477       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3478         {
3479           if (dump_enabled_p ())
3480             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3481                              "intermediate value used outside loop.\n");
3482
3483           return NULL;
3484         }
3485
3486       nphi_def_loop_uses++;
3487       phi_use_stmt = use_stmt;
3488     }
3489
3490   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3491   if (TREE_CODE (latch_def) != SSA_NAME)
3492     {
3493       if (dump_enabled_p ())
3494         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3495                          "reduction: not ssa_name: %T\n", latch_def);
3496       return NULL;
3497     }
3498
3499   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3500   if (!def_stmt_info
3501       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3502     return NULL;
3503
3504   bool nested_in_vect_loop
3505     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3506   unsigned nlatch_def_loop_uses = 0;
3507   auto_vec<gphi *, 3> lcphis;
3508   bool inner_loop_of_double_reduc = false;
3509   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3510     {
3511       gimple *use_stmt = USE_STMT (use_p);
3512       if (is_gimple_debug (use_stmt))
3513         continue;
3514       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3515         nlatch_def_loop_uses++;
3516       else
3517         {
3518           /* We can have more than one loop-closed PHI.  */
3519           lcphis.safe_push (as_a <gphi *> (use_stmt));
3520           if (nested_in_vect_loop
3521               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3522                   == vect_double_reduction_def))
3523             inner_loop_of_double_reduc = true;
3524         }
3525     }
3526
3527   /* If we are vectorizing an inner reduction we are executing that
3528      in the original order only in case we are not dealing with a
3529      double reduction.  */
3530   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3531     {
3532       if (dump_enabled_p ())
3533         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3534                         "detected nested cycle: ");
3535       return def_stmt_info;
3536     }
3537
3538   /* If this isn't a nested cycle or if the nested cycle reduction value
3539      is used ouside of the inner loop we cannot handle uses of the reduction
3540      value.  */
3541   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3542     {
3543       if (dump_enabled_p ())
3544         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3545                          "reduction used in loop.\n");
3546       return NULL;
3547     }
3548
3549   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3550      defined in the inner loop.  */
3551   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3552     {
3553       tree op1 = PHI_ARG_DEF (def_stmt, 0);
3554       if (gimple_phi_num_args (def_stmt) != 1
3555           || TREE_CODE (op1) != SSA_NAME)
3556         {
3557           if (dump_enabled_p ())
3558             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3559                              "unsupported phi node definition.\n");
3560
3561           return NULL;
3562         }
3563
3564       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3565       if (gimple_bb (def1)
3566           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3567           && loop->inner
3568           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3569           && is_gimple_assign (def1)
3570           && is_a <gphi *> (phi_use_stmt)
3571           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3572         {
3573           if (dump_enabled_p ())
3574             report_vect_op (MSG_NOTE, def_stmt,
3575                             "detected double reduction: ");
3576
3577           *double_reduc = true;
3578           return def_stmt_info;
3579         }
3580
3581       return NULL;
3582     }
3583
3584   /* Look for the expression computing latch_def from then loop PHI result.  */
3585   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3586   enum tree_code code;
3587   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3588                             path))
3589     {
3590       STMT_VINFO_REDUC_CODE (phi_info) = code;
3591       if (code == COND_EXPR && !nested_in_vect_loop)
3592         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3593
3594       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3595          reduction chain for which the additional restriction is that
3596          all operations in the chain are the same.  */
3597       auto_vec<stmt_vec_info, 8> reduc_chain;
3598       unsigned i;
3599       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3600       for (i = path.length () - 1; i >= 1; --i)
3601         {
3602           gimple *stmt = USE_STMT (path[i].second);
3603           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3604           STMT_VINFO_REDUC_IDX (stmt_info)
3605             = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3606           enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3607           bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3608                                      && (i == 1 || i == path.length () - 1));
3609           if ((stmt_code != code && !leading_conversion)
3610               /* We can only handle the final value in epilogue
3611                  generation for reduction chains.  */
3612               || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3613             is_slp_reduc = false;
3614           /* For reduction chains we support a trailing/leading
3615              conversions.  We do not store those in the actual chain.  */
3616           if (leading_conversion)
3617             continue;
3618           reduc_chain.safe_push (stmt_info);
3619         }
3620       if (is_slp_reduc && reduc_chain.length () > 1)
3621         {
3622           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3623             {
3624               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3625               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3626             }
3627           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3628           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3629
3630           /* Save the chain for further analysis in SLP detection.  */
3631           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3632           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3633
3634           *reduc_chain_p = true;
3635           if (dump_enabled_p ())
3636             dump_printf_loc (MSG_NOTE, vect_location,
3637                             "reduction: detected reduction chain\n");
3638         }
3639       else if (dump_enabled_p ())
3640         dump_printf_loc (MSG_NOTE, vect_location,
3641                          "reduction: detected reduction\n");
3642
3643       return def_stmt_info;
3644     }
3645
3646   if (dump_enabled_p ())
3647     dump_printf_loc (MSG_NOTE, vect_location,
3648                      "reduction: unknown pattern\n");
3649
3650   return NULL;
3651 }
3652
3653 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3654    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3655    or -1 if not known.  */
3656
3657 static int
3658 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3659 {
3660   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3661   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3662     {
3663       if (dump_enabled_p ())
3664         dump_printf_loc (MSG_NOTE, vect_location,
3665                          "cost model: epilogue peel iters set to vf/2 "
3666                          "because loop iterations are unknown .\n");
3667       return assumed_vf / 2;
3668     }
3669   else
3670     {
3671       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3672       peel_iters_prologue = MIN (niters, peel_iters_prologue);
3673       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3674       /* If we need to peel for gaps, but no peeling is required, we have to
3675          peel VF iterations.  */
3676       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3677         peel_iters_epilogue = assumed_vf;
3678       return peel_iters_epilogue;
3679     }
3680 }
3681
3682 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3683 int
3684 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3685                              int *peel_iters_epilogue,
3686                              stmt_vector_for_cost *scalar_cost_vec,
3687                              stmt_vector_for_cost *prologue_cost_vec,
3688                              stmt_vector_for_cost *epilogue_cost_vec)
3689 {
3690   int retval = 0;
3691
3692   *peel_iters_epilogue
3693     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3694
3695   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3696     {
3697       /* If peeled iterations are known but number of scalar loop
3698          iterations are unknown, count a taken branch per peeled loop.  */
3699       if (peel_iters_prologue > 0)
3700         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3701                                    NULL, NULL_TREE, 0, vect_prologue);
3702       if (*peel_iters_epilogue > 0)
3703         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3704                                     NULL, NULL_TREE, 0, vect_epilogue);
3705     }
3706
3707   stmt_info_for_cost *si;
3708   int j;
3709   if (peel_iters_prologue)
3710     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3711       retval += record_stmt_cost (prologue_cost_vec,
3712                                   si->count * peel_iters_prologue,
3713                                   si->kind, si->stmt_info, si->misalign,
3714                                   vect_prologue);
3715   if (*peel_iters_epilogue)
3716     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3717       retval += record_stmt_cost (epilogue_cost_vec,
3718                                   si->count * *peel_iters_epilogue,
3719                                   si->kind, si->stmt_info, si->misalign,
3720                                   vect_epilogue);
3721
3722   return retval;
3723 }
3724
3725 /* Function vect_estimate_min_profitable_iters
3726
3727    Return the number of iterations required for the vector version of the
3728    loop to be profitable relative to the cost of the scalar version of the
3729    loop.
3730
3731    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3732    of iterations for vectorization.  -1 value means loop vectorization
3733    is not profitable.  This returned value may be used for dynamic
3734    profitability check.
3735
3736    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3737    for static check against estimated number of iterations.  */
3738
3739 static void
3740 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3741                                     int *ret_min_profitable_niters,
3742                                     int *ret_min_profitable_estimate)
3743 {
3744   int min_profitable_iters;
3745   int min_profitable_estimate;
3746   int peel_iters_prologue;
3747   int peel_iters_epilogue;
3748   unsigned vec_inside_cost = 0;
3749   int vec_outside_cost = 0;
3750   unsigned vec_prologue_cost = 0;
3751   unsigned vec_epilogue_cost = 0;
3752   int scalar_single_iter_cost = 0;
3753   int scalar_outside_cost = 0;
3754   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3755   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3756   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3757
3758   /* Cost model disabled.  */
3759   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3760     {
3761       if (dump_enabled_p ())
3762         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3763       *ret_min_profitable_niters = 0;
3764       *ret_min_profitable_estimate = 0;
3765       return;
3766     }
3767
3768   /* Requires loop versioning tests to handle misalignment.  */
3769   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3770     {
3771       /*  FIXME: Make cost depend on complexity of individual check.  */
3772       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3773       (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3774                             NULL, NULL_TREE, 0, vect_prologue);
3775       if (dump_enabled_p ())
3776         dump_printf (MSG_NOTE,
3777                      "cost model: Adding cost of checks for loop "
3778                      "versioning to treat misalignment.\n");
3779     }
3780
3781   /* Requires loop versioning with alias checks.  */
3782   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3783     {
3784       /*  FIXME: Make cost depend on complexity of individual check.  */
3785       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3786       (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3787                             NULL, NULL_TREE, 0, vect_prologue);
3788       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3789       if (len)
3790         /* Count LEN - 1 ANDs and LEN comparisons.  */
3791         (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3792                               scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3793       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3794       if (len)
3795         {
3796           /* Count LEN - 1 ANDs and LEN comparisons.  */
3797           unsigned int nstmts = len * 2 - 1;
3798           /* +1 for each bias that needs adding.  */
3799           for (unsigned int i = 0; i < len; ++i)
3800             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3801               nstmts += 1;
3802           (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3803                                 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3804         }
3805       if (dump_enabled_p ())
3806         dump_printf (MSG_NOTE,
3807                      "cost model: Adding cost of checks for loop "
3808                      "versioning aliasing.\n");
3809     }
3810
3811   /* Requires loop versioning with niter checks.  */
3812   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3813     {
3814       /*  FIXME: Make cost depend on complexity of individual check.  */
3815       (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3816                             NULL, NULL_TREE, 0, vect_prologue);
3817       if (dump_enabled_p ())
3818         dump_printf (MSG_NOTE,
3819                      "cost model: Adding cost of checks for loop "
3820                      "versioning niters.\n");
3821     }
3822
3823   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3824     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3825                           NULL, NULL_TREE, 0, vect_prologue);
3826
3827   /* Count statements in scalar loop.  Using this as scalar cost for a single
3828      iteration for now.
3829
3830      TODO: Add outer loop support.
3831
3832      TODO: Consider assigning different costs to different scalar
3833      statements.  */
3834
3835   scalar_single_iter_cost
3836     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3837
3838   /* Add additional cost for the peeled instructions in prologue and epilogue
3839      loop.  (For fully-masked loops there will be no peeling.)
3840
3841      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3842      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3843
3844      TODO: Build an expression that represents peel_iters for prologue and
3845      epilogue to be used in a run-time test.  */
3846
3847   bool prologue_need_br_taken_cost = false;
3848   bool prologue_need_br_not_taken_cost = false;
3849
3850   /* Calculate peel_iters_prologue.  */
3851   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3852     peel_iters_prologue = 0;
3853   else if (npeel < 0)
3854     {
3855       peel_iters_prologue = assumed_vf / 2;
3856       if (dump_enabled_p ())
3857         dump_printf (MSG_NOTE, "cost model: "
3858                      "prologue peel iters set to vf/2.\n");
3859
3860       /* If peeled iterations are unknown, count a taken branch and a not taken
3861          branch per peeled loop.  Even if scalar loop iterations are known,
3862          vector iterations are not known since peeled prologue iterations are
3863          not known.  Hence guards remain the same.  */
3864       prologue_need_br_taken_cost = true;
3865       prologue_need_br_not_taken_cost = true;
3866     }
3867   else
3868     {
3869       peel_iters_prologue = npeel;
3870       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3871         /* If peeled iterations are known but number of scalar loop
3872            iterations are unknown, count a taken branch per peeled loop.  */
3873         prologue_need_br_taken_cost = true;
3874     }
3875
3876   bool epilogue_need_br_taken_cost = false;
3877   bool epilogue_need_br_not_taken_cost = false;
3878
3879   /* Calculate peel_iters_epilogue.  */
3880   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3881     /* We need to peel exactly one iteration for gaps.  */
3882     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3883   else if (npeel < 0)
3884     {
3885       /* If peeling for alignment is unknown, loop bound of main loop
3886          becomes unknown.  */
3887       peel_iters_epilogue = assumed_vf / 2;
3888       if (dump_enabled_p ())
3889         dump_printf (MSG_NOTE, "cost model: "
3890                      "epilogue peel iters set to vf/2 because "
3891                      "peeling for alignment is unknown.\n");
3892
3893       /* See the same reason above in peel_iters_prologue calculation.  */
3894       epilogue_need_br_taken_cost = true;
3895       epilogue_need_br_not_taken_cost = true;
3896     }
3897   else
3898     {
3899       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3900       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3901         /* If peeled iterations are known but number of scalar loop
3902            iterations are unknown, count a taken branch per peeled loop.  */
3903         epilogue_need_br_taken_cost = true;
3904     }
3905
3906   stmt_info_for_cost *si;
3907   int j;
3908   /* Add costs associated with peel_iters_prologue.  */
3909   if (peel_iters_prologue)
3910     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3911       {
3912         (void) add_stmt_cost (loop_vinfo, target_cost_data,
3913                               si->count * peel_iters_prologue, si->kind,
3914                               si->stmt_info, si->vectype, si->misalign,
3915                               vect_prologue);
3916       }
3917
3918   /* Add costs associated with peel_iters_epilogue.  */
3919   if (peel_iters_epilogue)
3920     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3921       {
3922         (void) add_stmt_cost (loop_vinfo, target_cost_data,
3923                               si->count * peel_iters_epilogue, si->kind,
3924                               si->stmt_info, si->vectype, si->misalign,
3925                               vect_epilogue);
3926       }
3927
3928   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
3929
3930   if (prologue_need_br_taken_cost)
3931     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3932                           NULL, NULL_TREE, 0, vect_prologue);
3933
3934   if (prologue_need_br_not_taken_cost)
3935     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3936                           cond_branch_not_taken, NULL, NULL_TREE, 0,
3937                           vect_prologue);
3938
3939   if (epilogue_need_br_taken_cost)
3940     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3941                           NULL, NULL_TREE, 0, vect_epilogue);
3942
3943   if (epilogue_need_br_not_taken_cost)
3944     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3945                           cond_branch_not_taken, NULL, NULL_TREE, 0,
3946                           vect_epilogue);
3947
3948   /* Take care of special costs for rgroup controls of partial vectors.  */
3949   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3950     {
3951       /* Calculate how many masks we need to generate.  */
3952       unsigned int num_masks = 0;
3953       rgroup_controls *rgm;
3954       unsigned int num_vectors_m1;
3955       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3956         if (rgm->type)
3957           num_masks += num_vectors_m1 + 1;
3958       gcc_assert (num_masks > 0);
3959
3960       /* In the worst case, we need to generate each mask in the prologue
3961          and in the loop body.  One of the loop body mask instructions
3962          replaces the comparison in the scalar loop, and since we don't
3963          count the scalar comparison against the scalar body, we shouldn't
3964          count that vector instruction against the vector body either.
3965
3966          Sometimes we can use unpacks instead of generating prologue
3967          masks and sometimes the prologue mask will fold to a constant,
3968          so the actual prologue cost might be smaller.  However, it's
3969          simpler and safer to use the worst-case cost; if this ends up
3970          being the tie-breaker between vectorizing or not, then it's
3971          probably better not to vectorize.  */
3972       (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
3973                             vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
3974       (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
3975                             vector_stmt, NULL, NULL_TREE, 0, vect_body);
3976     }
3977   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
3978     {
3979       /* Referring to the functions vect_set_loop_condition_partial_vectors
3980          and vect_set_loop_controls_directly, we need to generate each
3981          length in the prologue and in the loop body if required. Although
3982          there are some possible optimizations, we consider the worst case
3983          here.  */
3984
3985       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
3986       bool need_iterate_p
3987         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3988            && !vect_known_niters_smaller_than_vf (loop_vinfo));
3989
3990       /* Calculate how many statements to be added.  */
3991       unsigned int prologue_stmts = 0;
3992       unsigned int body_stmts = 0;
3993
3994       rgroup_controls *rgc;
3995       unsigned int num_vectors_m1;
3996       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
3997         if (rgc->type)
3998           {
3999             /* May need one SHIFT for nitems_total computation.  */
4000             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4001             if (nitems != 1 && !niters_known_p)
4002               prologue_stmts += 1;
4003
4004             /* May need one MAX and one MINUS for wrap around.  */
4005             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4006               prologue_stmts += 2;
4007
4008             /* Need one MAX and one MINUS for each batch limit excepting for
4009                the 1st one.  */
4010             prologue_stmts += num_vectors_m1 * 2;
4011
4012             unsigned int num_vectors = num_vectors_m1 + 1;
4013
4014             /* Need to set up lengths in prologue, only one MIN required
4015                for each since start index is zero.  */
4016             prologue_stmts += num_vectors;
4017
4018             /* Each may need two MINs and one MINUS to update lengths in body
4019                for next iteration.  */
4020             if (need_iterate_p)
4021               body_stmts += 3 * num_vectors;
4022           }
4023
4024       (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4025                             scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4026       (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4027                             scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4028     }
4029
4030   /* FORNOW: The scalar outside cost is incremented in one of the
4031      following ways:
4032
4033      1. The vectorizer checks for alignment and aliasing and generates
4034      a condition that allows dynamic vectorization.  A cost model
4035      check is ANDED with the versioning condition.  Hence scalar code
4036      path now has the added cost of the versioning check.
4037
4038        if (cost > th & versioning_check)
4039          jmp to vector code
4040
4041      Hence run-time scalar is incremented by not-taken branch cost.
4042
4043      2. The vectorizer then checks if a prologue is required.  If the
4044      cost model check was not done before during versioning, it has to
4045      be done before the prologue check.
4046
4047        if (cost <= th)
4048          prologue = scalar_iters
4049        if (prologue == 0)
4050          jmp to vector code
4051        else
4052          execute prologue
4053        if (prologue == num_iters)
4054          go to exit
4055
4056      Hence the run-time scalar cost is incremented by a taken branch,
4057      plus a not-taken branch, plus a taken branch cost.
4058
4059      3. The vectorizer then checks if an epilogue is required.  If the
4060      cost model check was not done before during prologue check, it
4061      has to be done with the epilogue check.
4062
4063        if (prologue == 0)
4064          jmp to vector code
4065        else
4066          execute prologue
4067        if (prologue == num_iters)
4068          go to exit
4069        vector code:
4070          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4071            jmp to epilogue
4072
4073      Hence the run-time scalar cost should be incremented by 2 taken
4074      branches.
4075
4076      TODO: The back end may reorder the BBS's differently and reverse
4077      conditions/branch directions.  Change the estimates below to
4078      something more reasonable.  */
4079
4080   /* If the number of iterations is known and we do not do versioning, we can
4081      decide whether to vectorize at compile time.  Hence the scalar version
4082      do not carry cost model guard costs.  */
4083   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4084       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4085     {
4086       /* Cost model check occurs at versioning.  */
4087       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4088         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4089       else
4090         {
4091           /* Cost model check occurs at prologue generation.  */
4092           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4093             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4094               + vect_get_stmt_cost (cond_branch_not_taken);
4095           /* Cost model check occurs at epilogue generation.  */
4096           else
4097             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4098         }
4099     }
4100
4101   /* Complete the target-specific cost calculations.  */
4102   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4103                &vec_inside_cost, &vec_epilogue_cost);
4104
4105   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4106
4107   /* Stash the costs so that we can compare two loop_vec_infos.  */
4108   loop_vinfo->vec_inside_cost = vec_inside_cost;
4109   loop_vinfo->vec_outside_cost = vec_outside_cost;
4110
4111   if (dump_enabled_p ())
4112     {
4113       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4114       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4115                    vec_inside_cost);
4116       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4117                    vec_prologue_cost);
4118       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4119                    vec_epilogue_cost);
4120       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4121                    scalar_single_iter_cost);
4122       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4123                    scalar_outside_cost);
4124       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4125                    vec_outside_cost);
4126       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4127                    peel_iters_prologue);
4128       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4129                    peel_iters_epilogue);
4130     }
4131
4132   /* Calculate number of iterations required to make the vector version
4133      profitable, relative to the loop bodies only.  The following condition
4134      must hold true:
4135      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4136      where
4137      SIC = scalar iteration cost, VIC = vector iteration cost,
4138      VOC = vector outside cost, VF = vectorization factor,
4139      NPEEL = prologue iterations + epilogue iterations,
4140      SOC = scalar outside cost for run time cost model check.  */
4141
4142   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4143                           - vec_inside_cost);
4144   if (saving_per_viter <= 0)
4145     {
4146       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4147         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4148                     "vectorization did not happen for a simd loop");
4149
4150       if (dump_enabled_p ())
4151         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4152                          "cost model: the vector iteration cost = %d "
4153                          "divided by the scalar iteration cost = %d "
4154                          "is greater or equal to the vectorization factor = %d"
4155                          ".\n",
4156                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4157       *ret_min_profitable_niters = -1;
4158       *ret_min_profitable_estimate = -1;
4159       return;
4160     }
4161
4162   /* ??? The "if" arm is written to handle all cases; see below for what
4163      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4164   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4165     {
4166       /* Rewriting the condition above in terms of the number of
4167          vector iterations (vniters) rather than the number of
4168          scalar iterations (niters) gives:
4169
4170          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4171
4172          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4173
4174          For integer N, X and Y when X > 0:
4175
4176          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
4177       int outside_overhead = (vec_outside_cost
4178                               - scalar_single_iter_cost * peel_iters_prologue
4179                               - scalar_single_iter_cost * peel_iters_epilogue
4180                               - scalar_outside_cost);
4181       /* We're only interested in cases that require at least one
4182          vector iteration.  */
4183       int min_vec_niters = 1;
4184       if (outside_overhead > 0)
4185         min_vec_niters = outside_overhead / saving_per_viter + 1;
4186
4187       if (dump_enabled_p ())
4188         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4189                      min_vec_niters);
4190
4191       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4192         {
4193           /* Now that we know the minimum number of vector iterations,
4194              find the minimum niters for which the scalar cost is larger:
4195
4196              SIC * niters > VIC * vniters + VOC - SOC
4197
4198              We know that the minimum niters is no more than
4199              vniters * VF + NPEEL, but it might be (and often is) less
4200              than that if a partial vector iteration is cheaper than the
4201              equivalent scalar code.  */
4202           int threshold = (vec_inside_cost * min_vec_niters
4203                            + vec_outside_cost
4204                            - scalar_outside_cost);
4205           if (threshold <= 0)
4206             min_profitable_iters = 1;
4207           else
4208             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4209         }
4210       else
4211         /* Convert the number of vector iterations into a number of
4212            scalar iterations.  */
4213         min_profitable_iters = (min_vec_niters * assumed_vf
4214                                 + peel_iters_prologue
4215                                 + peel_iters_epilogue);
4216     }
4217   else
4218     {
4219       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4220                               * assumed_vf
4221                               - vec_inside_cost * peel_iters_prologue
4222                               - vec_inside_cost * peel_iters_epilogue);
4223       if (min_profitable_iters <= 0)
4224         min_profitable_iters = 0;
4225       else
4226         {
4227           min_profitable_iters /= saving_per_viter;
4228
4229           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4230               <= (((int) vec_inside_cost * min_profitable_iters)
4231                   + (((int) vec_outside_cost - scalar_outside_cost)
4232                      * assumed_vf)))
4233             min_profitable_iters++;
4234         }
4235     }
4236
4237   if (dump_enabled_p ())
4238     dump_printf (MSG_NOTE,
4239                  "  Calculated minimum iters for profitability: %d\n",
4240                  min_profitable_iters);
4241
4242   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4243       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4244     /* We want the vectorized loop to execute at least once.  */
4245     min_profitable_iters = assumed_vf + peel_iters_prologue;
4246   else if (min_profitable_iters < peel_iters_prologue)
4247     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4248        vectorized loop executes at least once.  */
4249     min_profitable_iters = peel_iters_prologue;
4250
4251   if (dump_enabled_p ())
4252     dump_printf_loc (MSG_NOTE, vect_location,
4253                      "  Runtime profitability threshold = %d\n",
4254                      min_profitable_iters);
4255
4256   *ret_min_profitable_niters = min_profitable_iters;
4257
4258   /* Calculate number of iterations required to make the vector version
4259      profitable, relative to the loop bodies only.
4260
4261      Non-vectorized variant is SIC * niters and it must win over vector
4262      variant on the expected loop trip count.  The following condition must hold true:
4263      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
4264
4265   if (vec_outside_cost <= 0)
4266     min_profitable_estimate = 0;
4267   /* ??? This "else if" arm is written to handle all cases; see below for
4268      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4269   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4270     {
4271       /* This is a repeat of the code above, but with + SOC rather
4272          than - SOC.  */
4273       int outside_overhead = (vec_outside_cost
4274                               - scalar_single_iter_cost * peel_iters_prologue
4275                               - scalar_single_iter_cost * peel_iters_epilogue
4276                               + scalar_outside_cost);
4277       int min_vec_niters = 1;
4278       if (outside_overhead > 0)
4279         min_vec_niters = outside_overhead / saving_per_viter + 1;
4280
4281       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4282         {
4283           int threshold = (vec_inside_cost * min_vec_niters
4284                            + vec_outside_cost
4285                            + scalar_outside_cost);
4286           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4287         }
4288       else
4289         min_profitable_estimate = (min_vec_niters * assumed_vf
4290                                    + peel_iters_prologue
4291                                    + peel_iters_epilogue);
4292     }
4293   else
4294     {
4295       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4296                                  * assumed_vf
4297                                  - vec_inside_cost * peel_iters_prologue
4298                                  - vec_inside_cost * peel_iters_epilogue)
4299                                  / ((scalar_single_iter_cost * assumed_vf)
4300                                    - vec_inside_cost);
4301     }
4302   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4303   if (dump_enabled_p ())
4304     dump_printf_loc (MSG_NOTE, vect_location,
4305                      "  Static estimate profitability threshold = %d\n",
4306                      min_profitable_estimate);
4307
4308   *ret_min_profitable_estimate = min_profitable_estimate;
4309 }
4310
4311 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4312    vector elements (not bits) for a vector with NELT elements.  */
4313 static void
4314 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4315                               vec_perm_builder *sel)
4316 {
4317   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4318      by vec_perm_indices.  */
4319   sel->new_vector (nelt, 1, 3);
4320   for (unsigned int i = 0; i < 3; i++)
4321     sel->quick_push (i + offset);
4322 }
4323
4324 /* Checks whether the target supports whole-vector shifts for vectors of mode
4325    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4326    it supports vec_perm_const with masks for all necessary shift amounts.  */
4327 static bool
4328 have_whole_vector_shift (machine_mode mode)
4329 {
4330   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4331     return true;
4332
4333   /* Variable-length vectors should be handled via the optab.  */
4334   unsigned int nelt;
4335   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4336     return false;
4337
4338   vec_perm_builder sel;
4339   vec_perm_indices indices;
4340   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4341     {
4342       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4343       indices.new_vector (sel, 2, nelt);
4344       if (!can_vec_perm_const_p (mode, indices, false))
4345         return false;
4346     }
4347   return true;
4348 }
4349
4350 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4351    functions. Design better to avoid maintenance issues.  */
4352
4353 /* Function vect_model_reduction_cost.
4354
4355    Models cost for a reduction operation, including the vector ops
4356    generated within the strip-mine loop, the initial definition before
4357    the loop, and the epilogue code that must be generated.  */
4358
4359 static void
4360 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4361                            stmt_vec_info stmt_info, internal_fn reduc_fn,
4362                            vect_reduction_type reduction_type,
4363                            int ncopies, stmt_vector_for_cost *cost_vec)
4364 {
4365   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4366   enum tree_code code;
4367   optab optab;
4368   tree vectype;
4369   machine_mode mode;
4370   class loop *loop = NULL;
4371
4372   if (loop_vinfo)
4373     loop = LOOP_VINFO_LOOP (loop_vinfo);
4374
4375   /* Condition reductions generate two reductions in the loop.  */
4376   if (reduction_type == COND_REDUCTION)
4377     ncopies *= 2;
4378
4379   vectype = STMT_VINFO_VECTYPE (stmt_info);
4380   mode = TYPE_MODE (vectype);
4381   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4382
4383   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4384
4385   if (reduction_type == EXTRACT_LAST_REDUCTION)
4386     /* No extra instructions are needed in the prologue.  The loop body
4387        operations are costed in vectorizable_condition.  */
4388     inside_cost = 0;
4389   else if (reduction_type == FOLD_LEFT_REDUCTION)
4390     {
4391       /* No extra instructions needed in the prologue.  */
4392       prologue_cost = 0;
4393
4394       if (reduc_fn != IFN_LAST)
4395         /* Count one reduction-like operation per vector.  */
4396         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4397                                         stmt_info, 0, vect_body);
4398       else
4399         {
4400           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4401           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4402           inside_cost = record_stmt_cost (cost_vec, nelements,
4403                                           vec_to_scalar, stmt_info, 0,
4404                                           vect_body);
4405           inside_cost += record_stmt_cost (cost_vec, nelements,
4406                                            scalar_stmt, stmt_info, 0,
4407                                            vect_body);
4408         }
4409     }
4410   else
4411     {
4412       /* Add in cost for initial definition.
4413          For cond reduction we have four vectors: initial index, step,
4414          initial result of the data reduction, initial value of the index
4415          reduction.  */
4416       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4417       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4418                                          scalar_to_vec, stmt_info, 0,
4419                                          vect_prologue);
4420
4421       /* Cost of reduction op inside loop.  */
4422       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4423                                       stmt_info, 0, vect_body);
4424     }
4425
4426   /* Determine cost of epilogue code.
4427
4428      We have a reduction operator that will reduce the vector in one statement.
4429      Also requires scalar extract.  */
4430
4431   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4432     {
4433       if (reduc_fn != IFN_LAST)
4434         {
4435           if (reduction_type == COND_REDUCTION)
4436             {
4437               /* An EQ stmt and an COND_EXPR stmt.  */
4438               epilogue_cost += record_stmt_cost (cost_vec, 2,
4439                                                  vector_stmt, stmt_info, 0,
4440                                                  vect_epilogue);
4441               /* Reduction of the max index and a reduction of the found
4442                  values.  */
4443               epilogue_cost += record_stmt_cost (cost_vec, 2,
4444                                                  vec_to_scalar, stmt_info, 0,
4445                                                  vect_epilogue);
4446               /* A broadcast of the max value.  */
4447               epilogue_cost += record_stmt_cost (cost_vec, 1,
4448                                                  scalar_to_vec, stmt_info, 0,
4449                                                  vect_epilogue);
4450             }
4451           else
4452             {
4453               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4454                                                  stmt_info, 0, vect_epilogue);
4455               epilogue_cost += record_stmt_cost (cost_vec, 1,
4456                                                  vec_to_scalar, stmt_info, 0,
4457                                                  vect_epilogue);
4458             }
4459         }
4460       else if (reduction_type == COND_REDUCTION)
4461         {
4462           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4463           /* Extraction of scalar elements.  */
4464           epilogue_cost += record_stmt_cost (cost_vec,
4465                                              2 * estimated_nunits,
4466                                              vec_to_scalar, stmt_info, 0,
4467                                              vect_epilogue);
4468           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4469           epilogue_cost += record_stmt_cost (cost_vec,
4470                                              2 * estimated_nunits - 3,
4471                                              scalar_stmt, stmt_info, 0,
4472                                              vect_epilogue);
4473         }
4474       else if (reduction_type == EXTRACT_LAST_REDUCTION
4475                || reduction_type == FOLD_LEFT_REDUCTION)
4476         /* No extra instructions need in the epilogue.  */
4477         ;
4478       else
4479         {
4480           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4481           tree bitsize =
4482             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4483           int element_bitsize = tree_to_uhwi (bitsize);
4484           int nelements = vec_size_in_bits / element_bitsize;
4485
4486           if (code == COND_EXPR)
4487             code = MAX_EXPR;
4488
4489           optab = optab_for_tree_code (code, vectype, optab_default);
4490
4491           /* We have a whole vector shift available.  */
4492           if (optab != unknown_optab
4493               && VECTOR_MODE_P (mode)
4494               && optab_handler (optab, mode) != CODE_FOR_nothing
4495               && have_whole_vector_shift (mode))
4496             {
4497               /* Final reduction via vector shifts and the reduction operator.
4498                  Also requires scalar extract.  */
4499               epilogue_cost += record_stmt_cost (cost_vec,
4500                                                  exact_log2 (nelements) * 2,
4501                                                  vector_stmt, stmt_info, 0,
4502                                                  vect_epilogue);
4503               epilogue_cost += record_stmt_cost (cost_vec, 1,
4504                                                  vec_to_scalar, stmt_info, 0,
4505                                                  vect_epilogue);
4506             }
4507           else
4508             /* Use extracts and reduction op for final reduction.  For N
4509                elements, we have N extracts and N-1 reduction ops.  */
4510             epilogue_cost += record_stmt_cost (cost_vec,
4511                                                nelements + nelements - 1,
4512                                                vector_stmt, stmt_info, 0,
4513                                                vect_epilogue);
4514         }
4515     }
4516
4517   if (dump_enabled_p ())
4518     dump_printf (MSG_NOTE,
4519                  "vect_model_reduction_cost: inside_cost = %d, "
4520                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4521                  prologue_cost, epilogue_cost);
4522 }
4523
4524
4525
4526 /* Function get_initial_def_for_reduction
4527
4528    Input:
4529    STMT_VINFO - a stmt that performs a reduction operation in the loop.
4530    INIT_VAL - the initial value of the reduction variable
4531
4532    Output:
4533    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4534         of the reduction (used for adjusting the epilog - see below).
4535    Return a vector variable, initialized according to the operation that
4536         STMT_VINFO performs. This vector will be used as the initial value
4537         of the vector of partial results.
4538
4539    Option1 (adjust in epilog): Initialize the vector as follows:
4540      add/bit or/xor:    [0,0,...,0,0]
4541      mult/bit and:      [1,1,...,1,1]
4542      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4543    and when necessary (e.g. add/mult case) let the caller know
4544    that it needs to adjust the result by init_val.
4545
4546    Option2: Initialize the vector as follows:
4547      add/bit or/xor:    [init_val,0,0,...,0]
4548      mult/bit and:      [init_val,1,1,...,1]
4549      min/max/cond_expr: [init_val,init_val,...,init_val]
4550    and no adjustments are needed.
4551
4552    For example, for the following code:
4553
4554    s = init_val;
4555    for (i=0;i<n;i++)
4556      s = s + a[i];
4557
4558    STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4559    For a vector of 4 units, we want to return either [0,0,0,init_val],
4560    or [0,0,0,0] and let the caller know that it needs to adjust
4561    the result at the end by 'init_val'.
4562
4563    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4564    initialization vector is simpler (same element in all entries), if
4565    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4566
4567    A cost model should help decide between these two schemes.  */
4568
4569 static tree
4570 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4571                                stmt_vec_info stmt_vinfo,
4572                                enum tree_code code, tree init_val,
4573                                tree *adjustment_def)
4574 {
4575   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4576   tree scalar_type = TREE_TYPE (init_val);
4577   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4578   tree def_for_init;
4579   tree init_def;
4580   REAL_VALUE_TYPE real_init_val = dconst0;
4581   int int_init_val = 0;
4582   gimple_seq stmts = NULL;
4583
4584   gcc_assert (vectype);
4585
4586   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4587               || SCALAR_FLOAT_TYPE_P (scalar_type));
4588
4589   gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4590               || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4591
4592   /* ADJUSTMENT_DEF is NULL when called from
4593      vect_create_epilog_for_reduction to vectorize double reduction.  */
4594   if (adjustment_def)
4595     *adjustment_def = NULL;
4596
4597   switch (code)
4598     {
4599     case WIDEN_SUM_EXPR:
4600     case DOT_PROD_EXPR:
4601     case SAD_EXPR:
4602     case PLUS_EXPR:
4603     case MINUS_EXPR:
4604     case BIT_IOR_EXPR:
4605     case BIT_XOR_EXPR:
4606     case MULT_EXPR:
4607     case BIT_AND_EXPR:
4608       {
4609         if (code == MULT_EXPR)
4610           {
4611             real_init_val = dconst1;
4612             int_init_val = 1;
4613           }
4614
4615         if (code == BIT_AND_EXPR)
4616           int_init_val = -1;
4617
4618         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4619           def_for_init = build_real (scalar_type, real_init_val);
4620         else
4621           def_for_init = build_int_cst (scalar_type, int_init_val);
4622
4623         if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4624           {
4625             /* Option1: the first element is '0' or '1' as well.  */
4626             if (!operand_equal_p (def_for_init, init_val, 0))
4627               *adjustment_def = init_val;
4628             init_def = gimple_build_vector_from_val (&stmts, vectype,
4629                                                      def_for_init);
4630           }
4631         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4632           {
4633             /* Option2 (variable length): the first element is INIT_VAL.  */
4634             init_def = gimple_build_vector_from_val (&stmts, vectype,
4635                                                      def_for_init);
4636             init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4637                                      vectype, init_def, init_val);
4638           }
4639         else
4640           {
4641             /* Option2: the first element is INIT_VAL.  */
4642             tree_vector_builder elts (vectype, 1, 2);
4643             elts.quick_push (init_val);
4644             elts.quick_push (def_for_init);
4645             init_def = gimple_build_vector (&stmts, &elts);
4646           }
4647       }
4648       break;
4649
4650     case MIN_EXPR:
4651     case MAX_EXPR:
4652     case COND_EXPR:
4653       {
4654         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4655         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4656       }
4657       break;
4658
4659     default:
4660       gcc_unreachable ();
4661     }
4662
4663   if (stmts)
4664     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4665   return init_def;
4666 }
4667
4668 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4669    NUMBER_OF_VECTORS is the number of vector defs to create.
4670    If NEUTRAL_OP is nonnull, introducing extra elements of that
4671    value will not change the result.  */
4672
4673 static void
4674 get_initial_defs_for_reduction (vec_info *vinfo,
4675                                 slp_tree slp_node,
4676                                 vec<tree> *vec_oprnds,
4677                                 unsigned int number_of_vectors,
4678                                 bool reduc_chain, tree neutral_op)
4679 {
4680   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4681   stmt_vec_info stmt_vinfo = stmts[0];
4682   unsigned HOST_WIDE_INT nunits;
4683   unsigned j, number_of_places_left_in_vector;
4684   tree vector_type;
4685   unsigned int group_size = stmts.length ();
4686   unsigned int i;
4687   class loop *loop;
4688
4689   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4690
4691   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4692
4693   loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4694   gcc_assert (loop);
4695   edge pe = loop_preheader_edge (loop);
4696
4697   gcc_assert (!reduc_chain || neutral_op);
4698
4699   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4700      created vectors. It is greater than 1 if unrolling is performed.
4701
4702      For example, we have two scalar operands, s1 and s2 (e.g., group of
4703      strided accesses of size two), while NUNITS is four (i.e., four scalars
4704      of this type can be packed in a vector).  The output vector will contain
4705      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4706      will be 2).
4707
4708      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4709      vectors containing the operands.
4710
4711      For example, NUNITS is four as before, and the group size is 8
4712      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4713      {s5, s6, s7, s8}.  */
4714
4715   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4716     nunits = group_size;
4717
4718   number_of_places_left_in_vector = nunits;
4719   bool constant_p = true;
4720   tree_vector_builder elts (vector_type, nunits, 1);
4721   elts.quick_grow (nunits);
4722   gimple_seq ctor_seq = NULL;
4723   for (j = 0; j < nunits * number_of_vectors; ++j)
4724     {
4725       tree op;
4726       i = j % group_size;
4727       stmt_vinfo = stmts[i];
4728
4729       /* Get the def before the loop.  In reduction chain we have only
4730          one initial value.  Else we have as many as PHIs in the group.  */
4731       if (reduc_chain)
4732         op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4733       else if (((vec_oprnds->length () + 1) * nunits
4734                 - number_of_places_left_in_vector >= group_size)
4735                && neutral_op)
4736         op = neutral_op;
4737       else
4738         op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4739
4740       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4741       number_of_places_left_in_vector--;
4742       elts[nunits - number_of_places_left_in_vector - 1] = op;
4743       if (!CONSTANT_CLASS_P (op))
4744         constant_p = false;
4745
4746       if (number_of_places_left_in_vector == 0)
4747         {
4748           tree init;
4749           if (constant_p && !neutral_op
4750               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4751               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4752             /* Build the vector directly from ELTS.  */
4753             init = gimple_build_vector (&ctor_seq, &elts);
4754           else if (neutral_op)
4755             {
4756               /* Build a vector of the neutral value and shift the
4757                  other elements into place.  */
4758               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4759                                                    neutral_op);
4760               int k = nunits;
4761               while (k > 0 && elts[k - 1] == neutral_op)
4762                 k -= 1;
4763               while (k > 0)
4764                 {
4765                   k -= 1;
4766                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4767                                        vector_type, init, elts[k]);
4768                 }
4769             }
4770           else
4771             {
4772               /* First time round, duplicate ELTS to fill the
4773                  required number of vectors.  */
4774               duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4775                                         number_of_vectors, *vec_oprnds);
4776               break;
4777             }
4778           vec_oprnds->quick_push (init);
4779
4780           number_of_places_left_in_vector = nunits;
4781           elts.new_vector (vector_type, nunits, 1);
4782           elts.quick_grow (nunits);
4783           constant_p = true;
4784         }
4785     }
4786   if (ctor_seq != NULL)
4787     gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4788 }
4789
4790 /* For a statement STMT_INFO taking part in a reduction operation return
4791    the stmt_vec_info the meta information is stored on.  */
4792
4793 stmt_vec_info
4794 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4795 {
4796   stmt_info = vect_orig_stmt (stmt_info);
4797   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4798   if (!is_a <gphi *> (stmt_info->stmt)
4799       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4800     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4801   gphi *phi = as_a <gphi *> (stmt_info->stmt);
4802   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4803     {
4804       if (gimple_phi_num_args (phi) == 1)
4805         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4806     }
4807   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4808     {
4809       edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4810       stmt_vec_info info
4811           = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4812       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4813         stmt_info = info;
4814     }
4815   return stmt_info;
4816 }
4817
4818 /* Function vect_create_epilog_for_reduction
4819
4820    Create code at the loop-epilog to finalize the result of a reduction
4821    computation.
4822
4823    STMT_INFO is the scalar reduction stmt that is being vectorized.
4824    SLP_NODE is an SLP node containing a group of reduction statements. The
4825      first one in this group is STMT_INFO.
4826    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4827    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4828      (counting from 0)
4829
4830    This function:
4831    1. Completes the reduction def-use cycles.
4832    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4833       by calling the function specified by REDUC_FN if available, or by
4834       other means (whole-vector shifts or a scalar loop).
4835       The function also creates a new phi node at the loop exit to preserve
4836       loop-closed form, as illustrated below.
4837
4838      The flow at the entry to this function:
4839
4840         loop:
4841           vec_def = phi <vec_init, null>        # REDUCTION_PHI
4842           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4843           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4844         loop_exit:
4845           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4846           use <s_out0>
4847           use <s_out0>
4848
4849      The above is transformed by this function into:
4850
4851         loop:
4852           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4853           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4854           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4855         loop_exit:
4856           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4857           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4858           v_out2 = reduce <v_out1>
4859           s_out3 = extract_field <v_out2, 0>
4860           s_out4 = adjust_result <s_out3>
4861           use <s_out4>
4862           use <s_out4>
4863 */
4864
4865 static void
4866 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4867                                   stmt_vec_info stmt_info,
4868                                   slp_tree slp_node,
4869                                   slp_instance slp_node_instance)
4870 {
4871   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4872   gcc_assert (reduc_info->is_reduc_info);
4873   /* For double reductions we need to get at the inner loop reduction
4874      stmt which has the meta info attached.  Our stmt_info is that of the
4875      loop-closed PHI of the inner loop which we remember as
4876      def for the reduction PHI generation.  */
4877   bool double_reduc = false;
4878   stmt_vec_info rdef_info = stmt_info;
4879   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4880     {
4881       gcc_assert (!slp_node);
4882       double_reduc = true;
4883       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4884                                             (stmt_info->stmt, 0));
4885       stmt_info = vect_stmt_to_vectorize (stmt_info);
4886     }
4887   gphi *reduc_def_stmt
4888     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4889   enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4890   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4891   tree vectype;
4892   machine_mode mode;
4893   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4894   basic_block exit_bb;
4895   tree scalar_dest;
4896   tree scalar_type;
4897   gimple *new_phi = NULL, *phi;
4898   gimple_stmt_iterator exit_gsi;
4899   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4900   gimple *epilog_stmt = NULL;
4901   gimple *exit_phi;
4902   tree bitsize;
4903   tree def;
4904   tree orig_name, scalar_result;
4905   imm_use_iterator imm_iter, phi_imm_iter;
4906   use_operand_p use_p, phi_use_p;
4907   gimple *use_stmt;
4908   bool nested_in_vect_loop = false;
4909   auto_vec<gimple *> new_phis;
4910   int j, i;
4911   auto_vec<tree> scalar_results;
4912   unsigned int group_size = 1, k;
4913   auto_vec<gimple *> phis;
4914   bool slp_reduc = false;
4915   bool direct_slp_reduc;
4916   tree new_phi_result;
4917   tree induction_index = NULL_TREE;
4918
4919   if (slp_node)
4920     group_size = SLP_TREE_LANES (slp_node);
4921
4922   if (nested_in_vect_loop_p (loop, stmt_info))
4923     {
4924       outer_loop = loop;
4925       loop = loop->inner;
4926       nested_in_vect_loop = true;
4927       gcc_assert (!slp_node);
4928     }
4929   gcc_assert (!nested_in_vect_loop || double_reduc);
4930
4931   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4932   gcc_assert (vectype);
4933   mode = TYPE_MODE (vectype);
4934
4935   tree initial_def = NULL;
4936   tree induc_val = NULL_TREE;
4937   tree adjustment_def = NULL;
4938   if (slp_node)
4939     ;
4940   else
4941     {
4942       /* Get at the scalar def before the loop, that defines the initial value
4943          of the reduction variable.  */
4944       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4945                                            loop_preheader_edge (loop));
4946       /* Optimize: for induction condition reduction, if we can't use zero
4947          for induc_val, use initial_def.  */
4948       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4949         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4950       else if (double_reduc)
4951         ;
4952       else if (nested_in_vect_loop)
4953         ;
4954       else
4955         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4956     }
4957
4958   unsigned vec_num;
4959   int ncopies;
4960   if (slp_node)
4961     {
4962       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4963       ncopies = 1;
4964     }
4965   else
4966     {
4967       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4968       vec_num = 1;
4969       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4970     }
4971
4972   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4973      which is updated with the current index of the loop for every match of
4974      the original loop's cond_expr (VEC_STMT).  This results in a vector
4975      containing the last time the condition passed for that vector lane.
4976      The first match will be a 1 to allow 0 to be used for non-matching
4977      indexes.  If there are no matches at all then the vector will be all
4978      zeroes.
4979
4980      PR92772: This algorithm is broken for architectures that support
4981      masked vectors, but do not provide fold_extract_last.  */
4982   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4983     {
4984       auto_vec<std::pair<tree, bool>, 2> ccompares;
4985       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4986       cond_info = vect_stmt_to_vectorize (cond_info);
4987       while (cond_info != reduc_info)
4988         {
4989           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4990             {
4991               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4992               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4993               ccompares.safe_push
4994                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4995                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
4996             }
4997           cond_info
4998             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4999                                                  1 + STMT_VINFO_REDUC_IDX
5000                                                         (cond_info)));
5001           cond_info = vect_stmt_to_vectorize (cond_info);
5002         }
5003       gcc_assert (ccompares.length () != 0);
5004
5005       tree indx_before_incr, indx_after_incr;
5006       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5007       int scalar_precision
5008         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5009       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5010       tree cr_index_vector_type = get_related_vectype_for_scalar_type
5011         (TYPE_MODE (vectype), cr_index_scalar_type,
5012          TYPE_VECTOR_SUBPARTS (vectype));
5013
5014       /* First we create a simple vector induction variable which starts
5015          with the values {1,2,3,...} (SERIES_VECT) and increments by the
5016          vector size (STEP).  */
5017
5018       /* Create a {1,2,3,...} vector.  */
5019       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5020
5021       /* Create a vector of the step value.  */
5022       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5023       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5024
5025       /* Create an induction variable.  */
5026       gimple_stmt_iterator incr_gsi;
5027       bool insert_after;
5028       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5029       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5030                  insert_after, &indx_before_incr, &indx_after_incr);
5031
5032       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5033          filled with zeros (VEC_ZERO).  */
5034
5035       /* Create a vector of 0s.  */
5036       tree zero = build_zero_cst (cr_index_scalar_type);
5037       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5038
5039       /* Create a vector phi node.  */
5040       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5041       new_phi = create_phi_node (new_phi_tree, loop->header);
5042       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5043                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
5044
5045       /* Now take the condition from the loops original cond_exprs
5046          and produce a new cond_exprs (INDEX_COND_EXPR) which for
5047          every match uses values from the induction variable
5048          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5049          (NEW_PHI_TREE).
5050          Finally, we update the phi (NEW_PHI_TREE) to take the value of
5051          the new cond_expr (INDEX_COND_EXPR).  */
5052       gimple_seq stmts = NULL;
5053       for (int i = ccompares.length () - 1; i != -1; --i)
5054         {
5055           tree ccompare = ccompares[i].first;
5056           if (ccompares[i].second)
5057             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5058                                          cr_index_vector_type,
5059                                          ccompare,
5060                                          indx_before_incr, new_phi_tree);
5061           else
5062             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5063                                          cr_index_vector_type,
5064                                          ccompare,
5065                                          new_phi_tree, indx_before_incr);
5066         }
5067       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5068
5069       /* Update the phi with the vec cond.  */
5070       induction_index = new_phi_tree;
5071       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5072                    loop_latch_edge (loop), UNKNOWN_LOCATION);
5073     }
5074
5075   /* 2. Create epilog code.
5076         The reduction epilog code operates across the elements of the vector
5077         of partial results computed by the vectorized loop.
5078         The reduction epilog code consists of:
5079
5080         step 1: compute the scalar result in a vector (v_out2)
5081         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5082         step 3: adjust the scalar result (s_out3) if needed.
5083
5084         Step 1 can be accomplished using one the following three schemes:
5085           (scheme 1) using reduc_fn, if available.
5086           (scheme 2) using whole-vector shifts, if available.
5087           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5088                      combined.
5089
5090           The overall epilog code looks like this:
5091
5092           s_out0 = phi <s_loop>         # original EXIT_PHI
5093           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5094           v_out2 = reduce <v_out1>              # step 1
5095           s_out3 = extract_field <v_out2, 0>    # step 2
5096           s_out4 = adjust_result <s_out3>       # step 3
5097
5098           (step 3 is optional, and steps 1 and 2 may be combined).
5099           Lastly, the uses of s_out0 are replaced by s_out4.  */
5100
5101
5102   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5103          v_out1 = phi <VECT_DEF>
5104          Store them in NEW_PHIS.  */
5105   if (double_reduc)
5106     loop = outer_loop;
5107   exit_bb = single_exit (loop)->dest;
5108   new_phis.create (slp_node ? vec_num : ncopies);
5109   for (unsigned i = 0; i < vec_num; i++)
5110     {
5111       if (slp_node)
5112         def = vect_get_slp_vect_def (slp_node, i);
5113       else
5114         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5115       for (j = 0; j < ncopies; j++)
5116         {
5117           tree new_def = copy_ssa_name (def);
5118           phi = create_phi_node (new_def, exit_bb);
5119           if (j == 0)
5120             new_phis.quick_push (phi);
5121           else
5122             {
5123               def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5124               new_phis.quick_push (phi);
5125             }
5126
5127           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5128         }
5129     }
5130
5131   exit_gsi = gsi_after_labels (exit_bb);
5132
5133   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5134          (i.e. when reduc_fn is not available) and in the final adjustment
5135          code (if needed).  Also get the original scalar reduction variable as
5136          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5137          represents a reduction pattern), the tree-code and scalar-def are
5138          taken from the original stmt that the pattern-stmt (STMT) replaces.
5139          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5140          are taken from STMT.  */
5141
5142   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5143   if (orig_stmt_info != stmt_info)
5144     {
5145       /* Reduction pattern  */
5146       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5147       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5148     }
5149
5150   scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5151   scalar_type = TREE_TYPE (scalar_dest);
5152   scalar_results.create (group_size);
5153   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5154   bitsize = TYPE_SIZE (scalar_type);
5155
5156   /* SLP reduction without reduction chain, e.g.,
5157      # a1 = phi <a2, a0>
5158      # b1 = phi <b2, b0>
5159      a2 = operation (a1)
5160      b2 = operation (b1)  */
5161   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5162
5163   /* True if we should implement SLP_REDUC using native reduction operations
5164      instead of scalar operations.  */
5165   direct_slp_reduc = (reduc_fn != IFN_LAST
5166                       && slp_reduc
5167                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5168
5169   /* In case of reduction chain, e.g.,
5170      # a1 = phi <a3, a0>
5171      a2 = operation (a1)
5172      a3 = operation (a2),
5173
5174      we may end up with more than one vector result.  Here we reduce them to
5175      one vector.  */
5176   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5177     {
5178       gimple_seq stmts = NULL;
5179       tree first_vect = PHI_RESULT (new_phis[0]);
5180       first_vect = gimple_convert (&stmts, vectype, first_vect);
5181       for (k = 1; k < new_phis.length (); k++)
5182         {
5183           gimple *next_phi = new_phis[k];
5184           tree second_vect = PHI_RESULT (next_phi);
5185           second_vect = gimple_convert (&stmts, vectype, second_vect);
5186           first_vect = gimple_build (&stmts, code, vectype,
5187                                      first_vect, second_vect);
5188         }
5189       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5190
5191       new_phi_result = first_vect;
5192       new_phis.truncate (0);
5193       new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5194     }
5195   /* Likewise if we couldn't use a single defuse cycle.  */
5196   else if (ncopies > 1)
5197     {
5198       gimple_seq stmts = NULL;
5199       tree first_vect = PHI_RESULT (new_phis[0]);
5200       first_vect = gimple_convert (&stmts, vectype, first_vect);
5201       for (int k = 1; k < ncopies; ++k)
5202         {
5203           tree second_vect = PHI_RESULT (new_phis[k]);
5204           second_vect = gimple_convert (&stmts, vectype, second_vect);
5205           first_vect = gimple_build (&stmts, code, vectype,
5206                                      first_vect, second_vect);
5207         }
5208       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5209       new_phi_result = first_vect;
5210       new_phis.truncate (0);
5211       new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5212     }
5213   else
5214     new_phi_result = PHI_RESULT (new_phis[0]);
5215
5216   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5217       && reduc_fn != IFN_LAST)
5218     {
5219       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5220          various data values where the condition matched and another vector
5221          (INDUCTION_INDEX) containing all the indexes of those matches.  We
5222          need to extract the last matching index (which will be the index with
5223          highest value) and use this to index into the data vector.
5224          For the case where there were no matches, the data vector will contain
5225          all default values and the index vector will be all zeros.  */
5226
5227       /* Get various versions of the type of the vector of indexes.  */
5228       tree index_vec_type = TREE_TYPE (induction_index);
5229       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5230       tree index_scalar_type = TREE_TYPE (index_vec_type);
5231       tree index_vec_cmp_type = truth_type_for (index_vec_type);
5232
5233       /* Get an unsigned integer version of the type of the data vector.  */
5234       int scalar_precision
5235         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5236       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5237       tree vectype_unsigned = build_vector_type
5238         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5239
5240       /* First we need to create a vector (ZERO_VEC) of zeros and another
5241          vector (MAX_INDEX_VEC) filled with the last matching index, which we
5242          can create using a MAX reduction and then expanding.
5243          In the case where the loop never made any matches, the max index will
5244          be zero.  */
5245
5246       /* Vector of {0, 0, 0,...}.  */
5247       tree zero_vec = build_zero_cst (vectype);
5248
5249       gimple_seq stmts = NULL;
5250       new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5251       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5252
5253       /* Find maximum value from the vector of found indexes.  */
5254       tree max_index = make_ssa_name (index_scalar_type);
5255       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5256                                                           1, induction_index);
5257       gimple_call_set_lhs (max_index_stmt, max_index);
5258       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5259
5260       /* Vector of {max_index, max_index, max_index,...}.  */
5261       tree max_index_vec = make_ssa_name (index_vec_type);
5262       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5263                                                       max_index);
5264       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5265                                                         max_index_vec_rhs);
5266       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5267
5268       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5269          with the vector (INDUCTION_INDEX) of found indexes, choosing values
5270          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5271          otherwise.  Only one value should match, resulting in a vector
5272          (VEC_COND) with one data value and the rest zeros.
5273          In the case where the loop never made any matches, every index will
5274          match, resulting in a vector with all data values (which will all be
5275          the default value).  */
5276
5277       /* Compare the max index vector to the vector of found indexes to find
5278          the position of the max value.  */
5279       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5280       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5281                                                       induction_index,
5282                                                       max_index_vec);
5283       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5284
5285       /* Use the compare to choose either values from the data vector or
5286          zero.  */
5287       tree vec_cond = make_ssa_name (vectype);
5288       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5289                                                    vec_compare, new_phi_result,
5290                                                    zero_vec);
5291       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5292
5293       /* Finally we need to extract the data value from the vector (VEC_COND)
5294          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5295          reduction, but because this doesn't exist, we can use a MAX reduction
5296          instead.  The data value might be signed or a float so we need to cast
5297          it first.
5298          In the case where the loop never made any matches, the data values are
5299          all identical, and so will reduce down correctly.  */
5300
5301       /* Make the matched data values unsigned.  */
5302       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5303       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5304                                        vec_cond);
5305       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5306                                                         VIEW_CONVERT_EXPR,
5307                                                         vec_cond_cast_rhs);
5308       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5309
5310       /* Reduce down to a scalar value.  */
5311       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5312       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5313                                                            1, vec_cond_cast);
5314       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5315       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5316
5317       /* Convert the reduced value back to the result type and set as the
5318          result.  */
5319       stmts = NULL;
5320       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5321                                data_reduc);
5322       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5323       scalar_results.safe_push (new_temp);
5324     }
5325   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5326            && reduc_fn == IFN_LAST)
5327     {
5328       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5329          idx = 0;
5330          idx_val = induction_index[0];
5331          val = data_reduc[0];
5332          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5333            if (induction_index[i] > idx_val)
5334              val = data_reduc[i], idx_val = induction_index[i];
5335          return val;  */
5336
5337       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5338       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5339       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5340       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5341       /* Enforced by vectorizable_reduction, which ensures we have target
5342          support before allowing a conditional reduction on variable-length
5343          vectors.  */
5344       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5345       tree idx_val = NULL_TREE, val = NULL_TREE;
5346       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5347         {
5348           tree old_idx_val = idx_val;
5349           tree old_val = val;
5350           idx_val = make_ssa_name (idx_eltype);
5351           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5352                                              build3 (BIT_FIELD_REF, idx_eltype,
5353                                                      induction_index,
5354                                                      bitsize_int (el_size),
5355                                                      bitsize_int (off)));
5356           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5357           val = make_ssa_name (data_eltype);
5358           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5359                                              build3 (BIT_FIELD_REF,
5360                                                      data_eltype,
5361                                                      new_phi_result,
5362                                                      bitsize_int (el_size),
5363                                                      bitsize_int (off)));
5364           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5365           if (off != 0)
5366             {
5367               tree new_idx_val = idx_val;
5368               if (off != v_size - el_size)
5369                 {
5370                   new_idx_val = make_ssa_name (idx_eltype);
5371                   epilog_stmt = gimple_build_assign (new_idx_val,
5372                                                      MAX_EXPR, idx_val,
5373                                                      old_idx_val);
5374                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5375                 }
5376               tree new_val = make_ssa_name (data_eltype);
5377               epilog_stmt = gimple_build_assign (new_val,
5378                                                  COND_EXPR,
5379                                                  build2 (GT_EXPR,
5380                                                          boolean_type_node,
5381                                                          idx_val,
5382                                                          old_idx_val),
5383                                                  val, old_val);
5384               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5385               idx_val = new_idx_val;
5386               val = new_val;
5387             }
5388         }
5389       /* Convert the reduced value back to the result type and set as the
5390          result.  */
5391       gimple_seq stmts = NULL;
5392       val = gimple_convert (&stmts, scalar_type, val);
5393       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5394       scalar_results.safe_push (val);
5395     }
5396
5397   /* 2.3 Create the reduction code, using one of the three schemes described
5398          above. In SLP we simply need to extract all the elements from the
5399          vector (without reducing them), so we use scalar shifts.  */
5400   else if (reduc_fn != IFN_LAST && !slp_reduc)
5401     {
5402       tree tmp;
5403       tree vec_elem_type;
5404
5405       /* Case 1:  Create:
5406          v_out2 = reduc_expr <v_out1>  */
5407
5408       if (dump_enabled_p ())
5409         dump_printf_loc (MSG_NOTE, vect_location,
5410                          "Reduce using direct vector reduction.\n");
5411
5412       gimple_seq stmts = NULL;
5413       new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5414       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5415       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5416                                vec_elem_type, new_phi_result);
5417       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5418       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5419
5420       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5421           && induc_val)
5422         {
5423           /* Earlier we set the initial value to be a vector if induc_val
5424              values.  Check the result and if it is induc_val then replace
5425              with the original initial value, unless induc_val is
5426              the same as initial_def already.  */
5427           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5428                                   induc_val);
5429
5430           tmp = make_ssa_name (new_scalar_dest);
5431           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5432                                              initial_def, new_temp);
5433           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5434           new_temp = tmp;
5435         }
5436
5437       scalar_results.safe_push (new_temp);
5438     }
5439   else if (direct_slp_reduc)
5440     {
5441       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5442          with the elements for other SLP statements replaced with the
5443          neutral value.  We can then do a normal reduction on each vector.  */
5444
5445       /* Enforced by vectorizable_reduction.  */
5446       gcc_assert (new_phis.length () == 1);
5447       gcc_assert (pow2p_hwi (group_size));
5448
5449       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5450       vec<stmt_vec_info> orig_phis
5451         = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5452       gimple_seq seq = NULL;
5453
5454       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5455          and the same element size as VECTYPE.  */
5456       tree index = build_index_vector (vectype, 0, 1);
5457       tree index_type = TREE_TYPE (index);
5458       tree index_elt_type = TREE_TYPE (index_type);
5459       tree mask_type = truth_type_for (index_type);
5460
5461       /* Create a vector that, for each element, identifies which of
5462          the REDUC_GROUP_SIZE results should use it.  */
5463       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5464       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5465                             build_vector_from_val (index_type, index_mask));
5466
5467       /* Get a neutral vector value.  This is simply a splat of the neutral
5468          scalar value if we have one, otherwise the initial scalar value
5469          is itself a neutral value.  */
5470       tree vector_identity = NULL_TREE;
5471       tree neutral_op = NULL_TREE;
5472       if (slp_node)
5473         {
5474           stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5475           neutral_op
5476             = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5477                                             vectype, code, first != NULL);
5478         }
5479       if (neutral_op)
5480         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5481                                                         neutral_op);
5482       for (unsigned int i = 0; i < group_size; ++i)
5483         {
5484           /* If there's no univeral neutral value, we can use the
5485              initial scalar value from the original PHI.  This is used
5486              for MIN and MAX reduction, for example.  */
5487           if (!neutral_op)
5488             {
5489               tree scalar_value
5490                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5491                                          loop_preheader_edge (loop));
5492               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5493                                              scalar_value);
5494               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5495                                                               scalar_value);
5496             }
5497
5498           /* Calculate the equivalent of:
5499
5500              sel[j] = (index[j] == i);
5501
5502              which selects the elements of NEW_PHI_RESULT that should
5503              be included in the result.  */
5504           tree compare_val = build_int_cst (index_elt_type, i);
5505           compare_val = build_vector_from_val (index_type, compare_val);
5506           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5507                                    index, compare_val);
5508
5509           /* Calculate the equivalent of:
5510
5511              vec = seq ? new_phi_result : vector_identity;
5512
5513              VEC is now suitable for a full vector reduction.  */
5514           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5515                                    sel, new_phi_result, vector_identity);
5516
5517           /* Do the reduction and convert it to the appropriate type.  */
5518           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5519                                       TREE_TYPE (vectype), vec);
5520           scalar = gimple_convert (&seq, scalar_type, scalar);
5521           scalar_results.safe_push (scalar);
5522         }
5523       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5524     }
5525   else
5526     {
5527       bool reduce_with_shift;
5528       tree vec_temp;
5529
5530       gcc_assert (slp_reduc || new_phis.length () == 1);
5531
5532       /* See if the target wants to do the final (shift) reduction
5533          in a vector mode of smaller size and first reduce upper/lower
5534          halves against each other.  */
5535       enum machine_mode mode1 = mode;
5536       tree stype = TREE_TYPE (vectype);
5537       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5538       unsigned nunits1 = nunits;
5539       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5540           && new_phis.length () == 1)
5541         {
5542           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5543           /* For SLP reductions we have to make sure lanes match up, but
5544              since we're doing individual element final reduction reducing
5545              vector width here is even more important.
5546              ???  We can also separate lanes with permutes, for the common
5547              case of power-of-two group-size odd/even extracts would work.  */
5548           if (slp_reduc && nunits != nunits1)
5549             {
5550               nunits1 = least_common_multiple (nunits1, group_size);
5551               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5552             }
5553         }
5554       if (!slp_reduc
5555           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5556         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5557
5558       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5559                                                            stype, nunits1);
5560       reduce_with_shift = have_whole_vector_shift (mode1);
5561       if (!VECTOR_MODE_P (mode1))
5562         reduce_with_shift = false;
5563       else
5564         {
5565           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5566           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5567             reduce_with_shift = false;
5568         }
5569
5570       /* First reduce the vector to the desired vector size we should
5571          do shift reduction on by combining upper and lower halves.  */
5572       new_temp = new_phi_result;
5573       while (nunits > nunits1)
5574         {
5575           nunits /= 2;
5576           vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5577                                                           stype, nunits);
5578           unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5579
5580           /* The target has to make sure we support lowpart/highpart
5581              extraction, either via direct vector extract or through
5582              an integer mode punning.  */
5583           tree dst1, dst2;
5584           if (convert_optab_handler (vec_extract_optab,
5585                                      TYPE_MODE (TREE_TYPE (new_temp)),
5586                                      TYPE_MODE (vectype1))
5587               != CODE_FOR_nothing)
5588             {
5589               /* Extract sub-vectors directly once vec_extract becomes
5590                  a conversion optab.  */
5591               dst1 = make_ssa_name (vectype1);
5592               epilog_stmt
5593                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5594                                          build3 (BIT_FIELD_REF, vectype1,
5595                                                  new_temp, TYPE_SIZE (vectype1),
5596                                                  bitsize_int (0)));
5597               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5598               dst2 =  make_ssa_name (vectype1);
5599               epilog_stmt
5600                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5601                                          build3 (BIT_FIELD_REF, vectype1,
5602                                                  new_temp, TYPE_SIZE (vectype1),
5603                                                  bitsize_int (bitsize)));
5604               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5605             }
5606           else
5607             {
5608               /* Extract via punning to appropriately sized integer mode
5609                  vector.  */
5610               tree eltype = build_nonstandard_integer_type (bitsize, 1);
5611               tree etype = build_vector_type (eltype, 2);
5612               gcc_assert (convert_optab_handler (vec_extract_optab,
5613                                                  TYPE_MODE (etype),
5614                                                  TYPE_MODE (eltype))
5615                           != CODE_FOR_nothing);
5616               tree tem = make_ssa_name (etype);
5617               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5618                                                  build1 (VIEW_CONVERT_EXPR,
5619                                                          etype, new_temp));
5620               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5621               new_temp = tem;
5622               tem = make_ssa_name (eltype);
5623               epilog_stmt
5624                   = gimple_build_assign (tem, BIT_FIELD_REF,
5625                                          build3 (BIT_FIELD_REF, eltype,
5626                                                  new_temp, TYPE_SIZE (eltype),
5627                                                  bitsize_int (0)));
5628               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5629               dst1 = make_ssa_name (vectype1);
5630               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5631                                                  build1 (VIEW_CONVERT_EXPR,
5632                                                          vectype1, tem));
5633               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5634               tem = make_ssa_name (eltype);
5635               epilog_stmt
5636                   = gimple_build_assign (tem, BIT_FIELD_REF,
5637                                          build3 (BIT_FIELD_REF, eltype,
5638                                                  new_temp, TYPE_SIZE (eltype),
5639                                                  bitsize_int (bitsize)));
5640               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5641               dst2 =  make_ssa_name (vectype1);
5642               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5643                                                  build1 (VIEW_CONVERT_EXPR,
5644                                                          vectype1, tem));
5645               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5646             }
5647
5648           new_temp = make_ssa_name (vectype1);
5649           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5650           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5651           new_phis[0] = epilog_stmt;
5652         }
5653
5654       if (reduce_with_shift && !slp_reduc)
5655         {
5656           int element_bitsize = tree_to_uhwi (bitsize);
5657           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5658              for variable-length vectors and also requires direct target support
5659              for loop reductions.  */
5660           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5661           int nelements = vec_size_in_bits / element_bitsize;
5662           vec_perm_builder sel;
5663           vec_perm_indices indices;
5664
5665           int elt_offset;
5666
5667           tree zero_vec = build_zero_cst (vectype1);
5668           /* Case 2: Create:
5669              for (offset = nelements/2; offset >= 1; offset/=2)
5670                 {
5671                   Create:  va' = vec_shift <va, offset>
5672                   Create:  va = vop <va, va'>
5673                 }  */
5674
5675           tree rhs;
5676
5677           if (dump_enabled_p ())
5678             dump_printf_loc (MSG_NOTE, vect_location,
5679                              "Reduce using vector shifts\n");
5680
5681           gimple_seq stmts = NULL;
5682           new_temp = gimple_convert (&stmts, vectype1, new_temp);
5683           for (elt_offset = nelements / 2;
5684                elt_offset >= 1;
5685                elt_offset /= 2)
5686             {
5687               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5688               indices.new_vector (sel, 2, nelements);
5689               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5690               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5691                                        new_temp, zero_vec, mask);
5692               new_temp = gimple_build (&stmts, code,
5693                                        vectype1, new_name, new_temp);
5694             }
5695           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5696
5697           /* 2.4  Extract the final scalar result.  Create:
5698              s_out3 = extract_field <v_out2, bitpos>  */
5699
5700           if (dump_enabled_p ())
5701             dump_printf_loc (MSG_NOTE, vect_location,
5702                              "extract scalar result\n");
5703
5704           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5705                         bitsize, bitsize_zero_node);
5706           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5707           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5708           gimple_assign_set_lhs (epilog_stmt, new_temp);
5709           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5710           scalar_results.safe_push (new_temp);
5711         }
5712       else
5713         {
5714           /* Case 3: Create:
5715              s = extract_field <v_out2, 0>
5716              for (offset = element_size;
5717                   offset < vector_size;
5718                   offset += element_size;)
5719                {
5720                  Create:  s' = extract_field <v_out2, offset>
5721                  Create:  s = op <s, s'>  // For non SLP cases
5722                }  */
5723
5724           if (dump_enabled_p ())
5725             dump_printf_loc (MSG_NOTE, vect_location,
5726                              "Reduce using scalar code.\n");
5727
5728           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5729           int element_bitsize = tree_to_uhwi (bitsize);
5730           tree compute_type = TREE_TYPE (vectype);
5731           gimple_seq stmts = NULL;
5732           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5733             {
5734               int bit_offset;
5735               if (gimple_code (new_phi) == GIMPLE_PHI)
5736                 vec_temp = PHI_RESULT (new_phi);
5737               else
5738                 vec_temp = gimple_assign_lhs (new_phi);
5739               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5740                                        vec_temp, bitsize, bitsize_zero_node);
5741
5742               /* In SLP we don't need to apply reduction operation, so we just
5743                  collect s' values in SCALAR_RESULTS.  */
5744               if (slp_reduc)
5745                 scalar_results.safe_push (new_temp);
5746
5747               for (bit_offset = element_bitsize;
5748                    bit_offset < vec_size_in_bits;
5749                    bit_offset += element_bitsize)
5750                 {
5751                   tree bitpos = bitsize_int (bit_offset);
5752                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
5753                                            compute_type, vec_temp,
5754                                            bitsize, bitpos);
5755                   if (slp_reduc)
5756                     {
5757                       /* In SLP we don't need to apply reduction operation, so
5758                          we just collect s' values in SCALAR_RESULTS.  */
5759                       new_temp = new_name;
5760                       scalar_results.safe_push (new_name);
5761                     }
5762                   else
5763                     new_temp = gimple_build (&stmts, code, compute_type,
5764                                              new_name, new_temp);
5765                 }
5766             }
5767
5768           /* The only case where we need to reduce scalar results in SLP, is
5769              unrolling.  If the size of SCALAR_RESULTS is greater than
5770              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5771              REDUC_GROUP_SIZE.  */
5772           if (slp_reduc)
5773             {
5774               tree res, first_res, new_res;
5775
5776               /* Reduce multiple scalar results in case of SLP unrolling.  */
5777               for (j = group_size; scalar_results.iterate (j, &res);
5778                    j++)
5779                 {
5780                   first_res = scalar_results[j % group_size];
5781                   new_res = gimple_build (&stmts, code, compute_type,
5782                                           first_res, res);
5783                   scalar_results[j % group_size] = new_res;
5784                 }
5785               for (k = 0; k < group_size; k++)
5786                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5787                                                     scalar_results[k]);
5788             }
5789           else
5790             {
5791               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5792               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5793               scalar_results.safe_push (new_temp);
5794             }
5795
5796           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5797         }
5798
5799       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5800           && induc_val)
5801         {
5802           /* Earlier we set the initial value to be a vector if induc_val
5803              values.  Check the result and if it is induc_val then replace
5804              with the original initial value, unless induc_val is
5805              the same as initial_def already.  */
5806           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5807                                   induc_val);
5808
5809           tree tmp = make_ssa_name (new_scalar_dest);
5810           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5811                                              initial_def, new_temp);
5812           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5813           scalar_results[0] = tmp;
5814         }
5815     }
5816
5817   /* 2.5 Adjust the final result by the initial value of the reduction
5818          variable. (When such adjustment is not needed, then
5819          'adjustment_def' is zero).  For example, if code is PLUS we create:
5820          new_temp = loop_exit_def + adjustment_def  */
5821
5822   if (adjustment_def)
5823     {
5824       gcc_assert (!slp_reduc);
5825       gimple_seq stmts = NULL;
5826       if (nested_in_vect_loop)
5827         {
5828           new_phi = new_phis[0];
5829           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5830           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5831           new_temp = gimple_build (&stmts, code, vectype,
5832                                    PHI_RESULT (new_phi), adjustment_def);
5833         }
5834       else
5835         {
5836           new_temp = scalar_results[0];
5837           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5838           adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5839           new_temp = gimple_build (&stmts, code, scalar_type,
5840                                    new_temp, adjustment_def);
5841         }
5842
5843       epilog_stmt = gimple_seq_last_stmt (stmts);
5844       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5845       if (nested_in_vect_loop)
5846         {
5847           if (!double_reduc)
5848             scalar_results.quick_push (new_temp);
5849           else
5850             scalar_results[0] = new_temp;
5851         }
5852       else
5853         scalar_results[0] = new_temp;
5854
5855       new_phis[0] = epilog_stmt;
5856     }
5857
5858   if (double_reduc)
5859     loop = loop->inner;
5860
5861   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5862           phis with new adjusted scalar results, i.e., replace use <s_out0>
5863           with use <s_out4>.
5864
5865      Transform:
5866         loop_exit:
5867           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5868           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5869           v_out2 = reduce <v_out1>
5870           s_out3 = extract_field <v_out2, 0>
5871           s_out4 = adjust_result <s_out3>
5872           use <s_out0>
5873           use <s_out0>
5874
5875      into:
5876
5877         loop_exit:
5878           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5879           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5880           v_out2 = reduce <v_out1>
5881           s_out3 = extract_field <v_out2, 0>
5882           s_out4 = adjust_result <s_out3>
5883           use <s_out4>
5884           use <s_out4> */
5885
5886
5887   /* In SLP reduction chain we reduce vector results into one vector if
5888      necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
5889      LHS of the last stmt in the reduction chain, since we are looking for
5890      the loop exit phi node.  */
5891   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5892     {
5893       stmt_vec_info dest_stmt_info
5894         = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5895       scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5896       group_size = 1;
5897     }
5898
5899   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5900      case that REDUC_GROUP_SIZE is greater than vectorization factor).
5901      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5902      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5903      correspond to the first vector stmt, etc.
5904      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5905   if (group_size > new_phis.length ())
5906     gcc_assert (!(group_size % new_phis.length ()));
5907
5908   for (k = 0; k < group_size; k++)
5909     {
5910       if (slp_reduc)
5911         {
5912           stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5913
5914           orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5915           /* SLP statements can't participate in patterns.  */
5916           gcc_assert (!orig_stmt_info);
5917           scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5918         }
5919
5920       if (nested_in_vect_loop)
5921         {
5922           if (double_reduc)
5923             loop = outer_loop;
5924           else
5925             gcc_unreachable ();
5926         }
5927
5928       phis.create (3);
5929       /* Find the loop-closed-use at the loop exit of the original scalar
5930          result.  (The reduction result is expected to have two immediate uses,
5931          one at the latch block, and one at the loop exit).  For double
5932          reductions we are looking for exit phis of the outer loop.  */
5933       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5934         {
5935           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5936             {
5937               if (!is_gimple_debug (USE_STMT (use_p)))
5938                 phis.safe_push (USE_STMT (use_p));
5939             }
5940           else
5941             {
5942               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5943                 {
5944                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5945
5946                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5947                     {
5948                       if (!flow_bb_inside_loop_p (loop,
5949                                              gimple_bb (USE_STMT (phi_use_p)))
5950                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5951                         phis.safe_push (USE_STMT (phi_use_p));
5952                     }
5953                 }
5954             }
5955         }
5956
5957       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5958         {
5959           /* Replace the uses:  */
5960           orig_name = PHI_RESULT (exit_phi);
5961           scalar_result = scalar_results[k];
5962           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5963             {
5964               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5965                 SET_USE (use_p, scalar_result);
5966               update_stmt (use_stmt);
5967             }
5968         }
5969
5970       phis.release ();
5971     }
5972 }
5973
5974 /* Return a vector of type VECTYPE that is equal to the vector select
5975    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5976    before GSI.  */
5977
5978 static tree
5979 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5980                      tree vec, tree identity)
5981 {
5982   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5983   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5984                                           mask, vec, identity);
5985   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5986   return cond;
5987 }
5988
5989 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5990    order, starting with LHS.  Insert the extraction statements before GSI and
5991    associate the new scalar SSA names with variable SCALAR_DEST.
5992    Return the SSA name for the result.  */
5993
5994 static tree
5995 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5996                        tree_code code, tree lhs, tree vector_rhs)
5997 {
5998   tree vectype = TREE_TYPE (vector_rhs);
5999   tree scalar_type = TREE_TYPE (vectype);
6000   tree bitsize = TYPE_SIZE (scalar_type);
6001   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6002   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6003
6004   for (unsigned HOST_WIDE_INT bit_offset = 0;
6005        bit_offset < vec_size_in_bits;
6006        bit_offset += element_bitsize)
6007     {
6008       tree bitpos = bitsize_int (bit_offset);
6009       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6010                          bitsize, bitpos);
6011
6012       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6013       rhs = make_ssa_name (scalar_dest, stmt);
6014       gimple_assign_set_lhs (stmt, rhs);
6015       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6016
6017       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6018       tree new_name = make_ssa_name (scalar_dest, stmt);
6019       gimple_assign_set_lhs (stmt, new_name);
6020       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6021       lhs = new_name;
6022     }
6023   return lhs;
6024 }
6025
6026 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6027    type of the vector input.  */
6028
6029 static internal_fn
6030 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6031 {
6032   internal_fn mask_reduc_fn;
6033
6034   switch (reduc_fn)
6035     {
6036     case IFN_FOLD_LEFT_PLUS:
6037       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6038       break;
6039
6040     default:
6041       return IFN_LAST;
6042     }
6043
6044   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6045                                       OPTIMIZE_FOR_SPEED))
6046     return mask_reduc_fn;
6047   return IFN_LAST;
6048 }
6049
6050 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6051    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6052    statement.  CODE is the operation performed by STMT_INFO and OPS are
6053    its scalar operands.  REDUC_INDEX is the index of the operand in
6054    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6055    implements in-order reduction, or IFN_LAST if we should open-code it.
6056    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6057    that should be used to control the operation in a fully-masked loop.  */
6058
6059 static bool
6060 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6061                                stmt_vec_info stmt_info,
6062                                gimple_stmt_iterator *gsi,
6063                                gimple **vec_stmt, slp_tree slp_node,
6064                                gimple *reduc_def_stmt,
6065                                tree_code code, internal_fn reduc_fn,
6066                                tree ops[3], tree vectype_in,
6067                                int reduc_index, vec_loop_masks *masks)
6068 {
6069   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6070   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6071   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6072
6073   int ncopies;
6074   if (slp_node)
6075     ncopies = 1;
6076   else
6077     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6078
6079   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6080   gcc_assert (ncopies == 1);
6081   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6082
6083   if (slp_node)
6084     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6085                           TYPE_VECTOR_SUBPARTS (vectype_in)));
6086
6087   tree op0 = ops[1 - reduc_index];
6088
6089   int group_size = 1;
6090   stmt_vec_info scalar_dest_def_info;
6091   auto_vec<tree> vec_oprnds0;
6092   if (slp_node)
6093     {
6094       auto_vec<vec<tree> > vec_defs (2);
6095       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6096       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6097       vec_defs[0].release ();
6098       vec_defs[1].release ();
6099       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6100       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6101     }
6102   else
6103     {
6104       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6105                                      op0, &vec_oprnds0);
6106       scalar_dest_def_info = stmt_info;
6107     }
6108
6109   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6110   tree scalar_type = TREE_TYPE (scalar_dest);
6111   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6112
6113   int vec_num = vec_oprnds0.length ();
6114   gcc_assert (vec_num == 1 || slp_node);
6115   tree vec_elem_type = TREE_TYPE (vectype_out);
6116   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6117
6118   tree vector_identity = NULL_TREE;
6119   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6120     vector_identity = build_zero_cst (vectype_out);
6121
6122   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6123   int i;
6124   tree def0;
6125   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6126     {
6127       gimple *new_stmt;
6128       tree mask = NULL_TREE;
6129       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6130         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6131
6132       /* Handle MINUS by adding the negative.  */
6133       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6134         {
6135           tree negated = make_ssa_name (vectype_out);
6136           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6137           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6138           def0 = negated;
6139         }
6140
6141       if (mask && mask_reduc_fn == IFN_LAST)
6142         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6143                                     vector_identity);
6144
6145       /* On the first iteration the input is simply the scalar phi
6146          result, and for subsequent iterations it is the output of
6147          the preceding operation.  */
6148       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6149         {
6150           if (mask && mask_reduc_fn != IFN_LAST)
6151             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6152                                                    def0, mask);
6153           else
6154             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6155                                                    def0);
6156           /* For chained SLP reductions the output of the previous reduction
6157              operation serves as the input of the next. For the final statement
6158              the output cannot be a temporary - we reuse the original
6159              scalar destination of the last statement.  */
6160           if (i != vec_num - 1)
6161             {
6162               gimple_set_lhs (new_stmt, scalar_dest_var);
6163               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6164               gimple_set_lhs (new_stmt, reduc_var);
6165             }
6166         }
6167       else
6168         {
6169           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6170                                              reduc_var, def0);
6171           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6172           /* Remove the statement, so that we can use the same code paths
6173              as for statements that we've just created.  */
6174           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6175           gsi_remove (&tmp_gsi, true);
6176         }
6177
6178       if (i == vec_num - 1)
6179         {
6180           gimple_set_lhs (new_stmt, scalar_dest);
6181           vect_finish_replace_stmt (loop_vinfo,
6182                                     scalar_dest_def_info,
6183                                     new_stmt);
6184         }
6185       else
6186         vect_finish_stmt_generation (loop_vinfo,
6187                                      scalar_dest_def_info,
6188                                      new_stmt, gsi);
6189
6190       if (slp_node)
6191         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6192       else
6193         {
6194           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6195           *vec_stmt = new_stmt;
6196         }
6197     }
6198
6199   return true;
6200 }
6201
6202 /* Function is_nonwrapping_integer_induction.
6203
6204    Check if STMT_VINO (which is part of loop LOOP) both increments and
6205    does not cause overflow.  */
6206
6207 static bool
6208 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6209 {
6210   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6211   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6212   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6213   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6214   widest_int ni, max_loop_value, lhs_max;
6215   wi::overflow_type overflow = wi::OVF_NONE;
6216
6217   /* Make sure the loop is integer based.  */
6218   if (TREE_CODE (base) != INTEGER_CST
6219       || TREE_CODE (step) != INTEGER_CST)
6220     return false;
6221
6222   /* Check that the max size of the loop will not wrap.  */
6223
6224   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6225     return true;
6226
6227   if (! max_stmt_executions (loop, &ni))
6228     return false;
6229
6230   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6231                             &overflow);
6232   if (overflow)
6233     return false;
6234
6235   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6236                             TYPE_SIGN (lhs_type), &overflow);
6237   if (overflow)
6238     return false;
6239
6240   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6241           <= TYPE_PRECISION (lhs_type));
6242 }
6243
6244 /* Check if masking can be supported by inserting a conditional expression.
6245    CODE is the code for the operation.  COND_FN is the conditional internal
6246    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
6247 static bool
6248 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6249                          tree vectype_in)
6250 {
6251   if (cond_fn != IFN_LAST
6252       && direct_internal_fn_supported_p (cond_fn, vectype_in,
6253                                          OPTIMIZE_FOR_SPEED))
6254     return false;
6255
6256   switch (code)
6257     {
6258     case DOT_PROD_EXPR:
6259     case SAD_EXPR:
6260       return true;
6261
6262     default:
6263       return false;
6264     }
6265 }
6266
6267 /* Insert a conditional expression to enable masked vectorization.  CODE is the
6268    code for the operation.  VOP is the array of operands.  MASK is the loop
6269    mask.  GSI is a statement iterator used to place the new conditional
6270    expression.  */
6271 static void
6272 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6273                       gimple_stmt_iterator *gsi)
6274 {
6275   switch (code)
6276     {
6277     case DOT_PROD_EXPR:
6278       {
6279         tree vectype = TREE_TYPE (vop[1]);
6280         tree zero = build_zero_cst (vectype);
6281         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6282         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6283                                                mask, vop[1], zero);
6284         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6285         vop[1] = masked_op1;
6286         break;
6287       }
6288
6289     case SAD_EXPR:
6290       {
6291         tree vectype = TREE_TYPE (vop[1]);
6292         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6293         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6294                                                mask, vop[1], vop[0]);
6295         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6296         vop[1] = masked_op1;
6297         break;
6298       }
6299
6300     default:
6301       gcc_unreachable ();
6302     }
6303 }
6304
6305 /* Function vectorizable_reduction.
6306
6307    Check if STMT_INFO performs a reduction operation that can be vectorized.
6308    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6309    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6310    Return true if STMT_INFO is vectorizable in this way.
6311
6312    This function also handles reduction idioms (patterns) that have been
6313    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
6314    may be of this form:
6315      X = pattern_expr (arg0, arg1, ..., X)
6316    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6317    sequence that had been detected and replaced by the pattern-stmt
6318    (STMT_INFO).
6319
6320    This function also handles reduction of condition expressions, for example:
6321      for (int i = 0; i < N; i++)
6322        if (a[i] < value)
6323          last = a[i];
6324    This is handled by vectorising the loop and creating an additional vector
6325    containing the loop indexes for which "a[i] < value" was true.  In the
6326    function epilogue this is reduced to a single max value and then used to
6327    index into the vector of results.
6328
6329    In some cases of reduction patterns, the type of the reduction variable X is
6330    different than the type of the other arguments of STMT_INFO.
6331    In such cases, the vectype that is used when transforming STMT_INFO into
6332    a vector stmt is different than the vectype that is used to determine the
6333    vectorization factor, because it consists of a different number of elements
6334    than the actual number of elements that are being operated upon in parallel.
6335
6336    For example, consider an accumulation of shorts into an int accumulator.
6337    On some targets it's possible to vectorize this pattern operating on 8
6338    shorts at a time (hence, the vectype for purposes of determining the
6339    vectorization factor should be V8HI); on the other hand, the vectype that
6340    is used to create the vector form is actually V4SI (the type of the result).
6341
6342    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6343    indicates what is the actual level of parallelism (V8HI in the example), so
6344    that the right vectorization factor would be derived.  This vectype
6345    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6346    be used to create the vectorized stmt.  The right vectype for the vectorized
6347    stmt is obtained from the type of the result X:
6348       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6349
6350    This means that, contrary to "regular" reductions (or "regular" stmts in
6351    general), the following equation:
6352       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6353    does *NOT* necessarily hold for reduction patterns.  */
6354
6355 bool
6356 vectorizable_reduction (loop_vec_info loop_vinfo,
6357                         stmt_vec_info stmt_info, slp_tree slp_node,
6358                         slp_instance slp_node_instance,
6359                         stmt_vector_for_cost *cost_vec)
6360 {
6361   tree scalar_dest;
6362   tree vectype_in = NULL_TREE;
6363   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6364   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6365   stmt_vec_info cond_stmt_vinfo = NULL;
6366   tree scalar_type;
6367   int i;
6368   int ncopies;
6369   bool single_defuse_cycle = false;
6370   bool nested_cycle = false;
6371   bool double_reduc = false;
6372   int vec_num;
6373   tree tem;
6374   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6375   tree cond_reduc_val = NULL_TREE;
6376
6377   /* Make sure it was already recognized as a reduction computation.  */
6378   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6379       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6380       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6381     return false;
6382
6383   /* The stmt we store reduction analysis meta on.  */
6384   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6385   reduc_info->is_reduc_info = true;
6386
6387   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6388     {
6389       if (is_a <gphi *> (stmt_info->stmt))
6390         {
6391           if (slp_node)
6392             {
6393               /* We eventually need to set a vector type on invariant
6394                  arguments.  */
6395               unsigned j;
6396               slp_tree child;
6397               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6398                 if (!vect_maybe_update_slp_op_vectype
6399                        (child, SLP_TREE_VECTYPE (slp_node)))
6400                   {
6401                     if (dump_enabled_p ())
6402                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6403                                        "incompatible vector types for "
6404                                        "invariants\n");
6405                     return false;
6406                   }
6407             }
6408           /* Analysis for double-reduction is done on the outer
6409              loop PHI, nested cycles have no further restrictions.  */
6410           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6411         }
6412       else
6413         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6414       return true;
6415     }
6416
6417   stmt_vec_info orig_stmt_of_analysis = stmt_info;
6418   stmt_vec_info phi_info = stmt_info;
6419   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6420       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6421     {
6422       if (!is_a <gphi *> (stmt_info->stmt))
6423         {
6424           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6425           return true;
6426         }
6427       if (slp_node)
6428         {
6429           slp_node_instance->reduc_phis = slp_node;
6430           /* ???  We're leaving slp_node to point to the PHIs, we only
6431              need it to get at the number of vector stmts which wasn't
6432              yet initialized for the instance root.  */
6433         }
6434       if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6435         stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6436       else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6437         {
6438           use_operand_p use_p;
6439           gimple *use_stmt;
6440           bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6441                                      &use_p, &use_stmt);
6442           gcc_assert (res);
6443           phi_info = loop_vinfo->lookup_stmt (use_stmt);
6444           stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6445         }
6446     }
6447
6448   /* PHIs should not participate in patterns.  */
6449   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6450   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6451
6452   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6453      and compute the reduction chain length.  Discover the real
6454      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
6455   tree reduc_def
6456     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6457                              loop_latch_edge
6458                                (gimple_bb (reduc_def_phi)->loop_father));
6459   unsigned reduc_chain_length = 0;
6460   bool only_slp_reduc_chain = true;
6461   stmt_info = NULL;
6462   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6463   while (reduc_def != PHI_RESULT (reduc_def_phi))
6464     {
6465       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6466       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6467       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6468         {
6469           if (dump_enabled_p ())
6470             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6471                              "reduction chain broken by patterns.\n");
6472           return false;
6473         }
6474       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6475         only_slp_reduc_chain = false;
6476       /* ???  For epilogue generation live members of the chain need
6477          to point back to the PHI via their original stmt for
6478          info_for_reduction to work.  */
6479       if (STMT_VINFO_LIVE_P (vdef))
6480         STMT_VINFO_REDUC_DEF (def) = phi_info;
6481       gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6482       if (!assign)
6483         {
6484           if (dump_enabled_p ())
6485             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6486                              "reduction chain includes calls.\n");
6487           return false;
6488         }
6489       if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6490         {
6491           if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6492                                       TREE_TYPE (gimple_assign_rhs1 (assign))))
6493             {
6494               if (dump_enabled_p ())
6495                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6496                                  "conversion in the reduction chain.\n");
6497               return false;
6498             }
6499         }
6500       else if (!stmt_info)
6501         /* First non-conversion stmt.  */
6502         stmt_info = vdef;
6503       reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6504       reduc_chain_length++;
6505       if (!stmt_info && slp_node)
6506         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6507     }
6508   /* PHIs should not participate in patterns.  */
6509   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6510
6511   if (nested_in_vect_loop_p (loop, stmt_info))
6512     {
6513       loop = loop->inner;
6514       nested_cycle = true;
6515     }
6516
6517   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6518      element.  */
6519   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6520     {
6521       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6522       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6523     }
6524   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6525     gcc_assert (slp_node
6526                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6527
6528   /* 1. Is vectorizable reduction?  */
6529   /* Not supportable if the reduction variable is used in the loop, unless
6530      it's a reduction chain.  */
6531   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6532       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6533     return false;
6534
6535   /* Reductions that are not used even in an enclosing outer-loop,
6536      are expected to be "live" (used out of the loop).  */
6537   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6538       && !STMT_VINFO_LIVE_P (stmt_info))
6539     return false;
6540
6541   /* 2. Has this been recognized as a reduction pattern?
6542
6543      Check if STMT represents a pattern that has been recognized
6544      in earlier analysis stages.  For stmts that represent a pattern,
6545      the STMT_VINFO_RELATED_STMT field records the last stmt in
6546      the original sequence that constitutes the pattern.  */
6547
6548   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6549   if (orig_stmt_info)
6550     {
6551       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6552       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6553     }
6554
6555   /* 3. Check the operands of the operation.  The first operands are defined
6556         inside the loop body. The last operand is the reduction variable,
6557         which is defined by the loop-header-phi.  */
6558
6559   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6560   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6561   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6562   enum tree_code code = gimple_assign_rhs_code (stmt);
6563   bool lane_reduc_code_p
6564     = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6565   int op_type = TREE_CODE_LENGTH (code);
6566
6567   scalar_dest = gimple_assign_lhs (stmt);
6568   scalar_type = TREE_TYPE (scalar_dest);
6569   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6570       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6571     return false;
6572
6573   /* Do not try to vectorize bit-precision reductions.  */
6574   if (!type_has_mode_precision_p (scalar_type))
6575     return false;
6576
6577   /* For lane-reducing ops we're reducing the number of reduction PHIs
6578      which means the only use of that may be in the lane-reducing operation.  */
6579   if (lane_reduc_code_p
6580       && reduc_chain_length != 1
6581       && !only_slp_reduc_chain)
6582     {
6583       if (dump_enabled_p ())
6584         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6585                          "lane-reducing reduction with extra stmts.\n");
6586       return false;
6587     }
6588
6589   /* All uses but the last are expected to be defined in the loop.
6590      The last use is the reduction variable.  In case of nested cycle this
6591      assumption is not true: we use reduc_index to record the index of the
6592      reduction variable.  */
6593   slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6594   /* We need to skip an extra operand for COND_EXPRs with embedded
6595      comparison.  */
6596   unsigned opno_adjust = 0;
6597   if (code == COND_EXPR
6598       && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6599     opno_adjust = 1;
6600   for (i = 0; i < op_type; i++)
6601     {
6602       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6603       if (i == 0 && code == COND_EXPR)
6604         continue;
6605
6606       stmt_vec_info def_stmt_info;
6607       enum vect_def_type dt;
6608       tree op;
6609       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6610                                i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6611                                &def_stmt_info))
6612         {
6613           if (dump_enabled_p ())
6614             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6615                              "use not simple.\n");
6616           return false;
6617         }
6618       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6619         continue;
6620
6621       /* There should be only one cycle def in the stmt, the one
6622          leading to reduc_def.  */
6623       if (VECTORIZABLE_CYCLE_DEF (dt))
6624         return false;
6625
6626       /* To properly compute ncopies we are interested in the widest
6627          non-reduction input type in case we're looking at a widening
6628          accumulation that we later handle in vect_transform_reduction.  */
6629       if (lane_reduc_code_p
6630           && tem
6631           && (!vectype_in
6632               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6633                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6634         vectype_in = tem;
6635
6636       if (code == COND_EXPR)
6637         {
6638           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
6639           if (dt == vect_constant_def)
6640             {
6641               cond_reduc_dt = dt;
6642               cond_reduc_val = op;
6643             }
6644           if (dt == vect_induction_def
6645               && def_stmt_info
6646               && is_nonwrapping_integer_induction (def_stmt_info, loop))
6647             {
6648               cond_reduc_dt = dt;
6649               cond_stmt_vinfo = def_stmt_info;
6650             }
6651         }
6652     }
6653   if (!vectype_in)
6654     vectype_in = STMT_VINFO_VECTYPE (phi_info);
6655   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6656
6657   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6658   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6659   /* If we have a condition reduction, see if we can simplify it further.  */
6660   if (v_reduc_type == COND_REDUCTION)
6661     {
6662       if (slp_node)
6663         return false;
6664
6665       /* When the condition uses the reduction value in the condition, fail.  */
6666       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6667         {
6668           if (dump_enabled_p ())
6669             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6670                              "condition depends on previous iteration\n");
6671           return false;
6672         }
6673
6674       if (reduc_chain_length == 1
6675           && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6676                                              vectype_in, OPTIMIZE_FOR_SPEED))
6677         {
6678           if (dump_enabled_p ())
6679             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6680                              "optimizing condition reduction with"
6681                              " FOLD_EXTRACT_LAST.\n");
6682           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6683         }
6684       else if (cond_reduc_dt == vect_induction_def)
6685         {
6686           tree base
6687             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6688           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6689
6690           gcc_assert (TREE_CODE (base) == INTEGER_CST
6691                       && TREE_CODE (step) == INTEGER_CST);
6692           cond_reduc_val = NULL_TREE;
6693           enum tree_code cond_reduc_op_code = ERROR_MARK;
6694           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6695           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6696             ;
6697           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6698              above base; punt if base is the minimum value of the type for
6699              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6700           else if (tree_int_cst_sgn (step) == -1)
6701             {
6702               cond_reduc_op_code = MIN_EXPR;
6703               if (tree_int_cst_sgn (base) == -1)
6704                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6705               else if (tree_int_cst_lt (base,
6706                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6707                 cond_reduc_val
6708                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6709             }
6710           else
6711             {
6712               cond_reduc_op_code = MAX_EXPR;
6713               if (tree_int_cst_sgn (base) == 1)
6714                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6715               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6716                                         base))
6717                 cond_reduc_val
6718                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6719             }
6720           if (cond_reduc_val)
6721             {
6722               if (dump_enabled_p ())
6723                 dump_printf_loc (MSG_NOTE, vect_location,
6724                                  "condition expression based on "
6725                                  "integer induction.\n");
6726               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6727               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6728                 = cond_reduc_val;
6729               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6730             }
6731         }
6732       else if (cond_reduc_dt == vect_constant_def)
6733         {
6734           enum vect_def_type cond_initial_dt;
6735           tree cond_initial_val
6736             = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6737
6738           gcc_assert (cond_reduc_val != NULL_TREE);
6739           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6740           if (cond_initial_dt == vect_constant_def
6741               && types_compatible_p (TREE_TYPE (cond_initial_val),
6742                                      TREE_TYPE (cond_reduc_val)))
6743             {
6744               tree e = fold_binary (LE_EXPR, boolean_type_node,
6745                                     cond_initial_val, cond_reduc_val);
6746               if (e && (integer_onep (e) || integer_zerop (e)))
6747                 {
6748                   if (dump_enabled_p ())
6749                     dump_printf_loc (MSG_NOTE, vect_location,
6750                                      "condition expression based on "
6751                                      "compile time constant.\n");
6752                   /* Record reduction code at analysis stage.  */
6753                   STMT_VINFO_REDUC_CODE (reduc_info)
6754                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6755                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6756                 }
6757             }
6758         }
6759     }
6760
6761   if (STMT_VINFO_LIVE_P (phi_info))
6762     return false;
6763
6764   if (slp_node)
6765     ncopies = 1;
6766   else
6767     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6768
6769   gcc_assert (ncopies >= 1);
6770
6771   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6772
6773   if (nested_cycle)
6774     {
6775       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6776                   == vect_double_reduction_def);
6777       double_reduc = true;
6778     }
6779
6780   /* 4.2. Check support for the epilog operation.
6781
6782           If STMT represents a reduction pattern, then the type of the
6783           reduction variable may be different than the type of the rest
6784           of the arguments.  For example, consider the case of accumulation
6785           of shorts into an int accumulator; The original code:
6786                         S1: int_a = (int) short_a;
6787           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6788
6789           was replaced with:
6790                         STMT: int_acc = widen_sum <short_a, int_acc>
6791
6792           This means that:
6793           1. The tree-code that is used to create the vector operation in the
6794              epilog code (that reduces the partial results) is not the
6795              tree-code of STMT, but is rather the tree-code of the original
6796              stmt from the pattern that STMT is replacing.  I.e, in the example
6797              above we want to use 'widen_sum' in the loop, but 'plus' in the
6798              epilog.
6799           2. The type (mode) we use to check available target support
6800              for the vector operation to be created in the *epilog*, is
6801              determined by the type of the reduction variable (in the example
6802              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6803              However the type (mode) we use to check available target support
6804              for the vector operation to be created *inside the loop*, is
6805              determined by the type of the other arguments to STMT (in the
6806              example we'd check this: optab_handler (widen_sum_optab,
6807              vect_short_mode)).
6808
6809           This is contrary to "regular" reductions, in which the types of all
6810           the arguments are the same as the type of the reduction variable.
6811           For "regular" reductions we can therefore use the same vector type
6812           (and also the same tree-code) when generating the epilog code and
6813           when generating the code inside the loop.  */
6814
6815   enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6816   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6817
6818   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6819   if (reduction_type == TREE_CODE_REDUCTION)
6820     {
6821       /* Check whether it's ok to change the order of the computation.
6822          Generally, when vectorizing a reduction we change the order of the
6823          computation.  This may change the behavior of the program in some
6824          cases, so we need to check that this is ok.  One exception is when
6825          vectorizing an outer-loop: the inner-loop is executed sequentially,
6826          and therefore vectorizing reductions in the inner-loop during
6827          outer-loop vectorization is safe.  */
6828       if (needs_fold_left_reduction_p (scalar_type, orig_code))
6829         {
6830           /* When vectorizing a reduction chain w/o SLP the reduction PHI
6831              is not directy used in stmt.  */
6832           if (!only_slp_reduc_chain
6833               && reduc_chain_length != 1)
6834             {
6835               if (dump_enabled_p ())
6836                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6837                                  "in-order reduction chain without SLP.\n");
6838               return false;
6839             }
6840           STMT_VINFO_REDUC_TYPE (reduc_info)
6841             = reduction_type = FOLD_LEFT_REDUCTION;
6842         }
6843       else if (!commutative_tree_code (orig_code)
6844                || !associative_tree_code (orig_code))
6845         {
6846           if (dump_enabled_p ())
6847             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6848                             "reduction: not commutative/associative");
6849           return false;
6850         }
6851     }
6852
6853   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6854       && ncopies > 1)
6855     {
6856       if (dump_enabled_p ())
6857         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6858                          "multiple types in double reduction or condition "
6859                          "reduction or fold-left reduction.\n");
6860       return false;
6861     }
6862
6863   internal_fn reduc_fn = IFN_LAST;
6864   if (reduction_type == TREE_CODE_REDUCTION
6865       || reduction_type == FOLD_LEFT_REDUCTION
6866       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6867       || reduction_type == CONST_COND_REDUCTION)
6868     {
6869       if (reduction_type == FOLD_LEFT_REDUCTION
6870           ? fold_left_reduction_fn (orig_code, &reduc_fn)
6871           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6872         {
6873           if (reduc_fn != IFN_LAST
6874               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6875                                                   OPTIMIZE_FOR_SPEED))
6876             {
6877               if (dump_enabled_p ())
6878                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6879                                  "reduc op not supported by target.\n");
6880
6881               reduc_fn = IFN_LAST;
6882             }
6883         }
6884       else
6885         {
6886           if (!nested_cycle || double_reduc)
6887             {
6888               if (dump_enabled_p ())
6889                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6890                                  "no reduc code for scalar code.\n");
6891
6892               return false;
6893             }
6894         }
6895     }
6896   else if (reduction_type == COND_REDUCTION)
6897     {
6898       int scalar_precision
6899         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6900       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6901       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6902                                                 nunits_out);
6903
6904       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6905                                           OPTIMIZE_FOR_SPEED))
6906         reduc_fn = IFN_REDUC_MAX;
6907     }
6908   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6909
6910   if (reduction_type != EXTRACT_LAST_REDUCTION
6911       && (!nested_cycle || double_reduc)
6912       && reduc_fn == IFN_LAST
6913       && !nunits_out.is_constant ())
6914     {
6915       if (dump_enabled_p ())
6916         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6917                          "missing target support for reduction on"
6918                          " variable-length vectors.\n");
6919       return false;
6920     }
6921
6922   /* For SLP reductions, see if there is a neutral value we can use.  */
6923   tree neutral_op = NULL_TREE;
6924   if (slp_node)
6925     neutral_op = neutral_op_for_slp_reduction
6926       (slp_node_instance->reduc_phis, vectype_out, orig_code,
6927        REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6928
6929   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6930     {
6931       /* We can't support in-order reductions of code such as this:
6932
6933            for (int i = 0; i < n1; ++i)
6934              for (int j = 0; j < n2; ++j)
6935                l += a[j];
6936
6937          since GCC effectively transforms the loop when vectorizing:
6938
6939            for (int i = 0; i < n1 / VF; ++i)
6940              for (int j = 0; j < n2; ++j)
6941                for (int k = 0; k < VF; ++k)
6942                  l += a[j];
6943
6944          which is a reassociation of the original operation.  */
6945       if (dump_enabled_p ())
6946         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6947                          "in-order double reduction not supported.\n");
6948
6949       return false;
6950     }
6951
6952   if (reduction_type == FOLD_LEFT_REDUCTION
6953       && slp_node
6954       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6955     {
6956       /* We cannot use in-order reductions in this case because there is
6957          an implicit reassociation of the operations involved.  */
6958       if (dump_enabled_p ())
6959         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6960                          "in-order unchained SLP reductions not supported.\n");
6961       return false;
6962     }
6963
6964   /* For double reductions, and for SLP reductions with a neutral value,
6965      we construct a variable-length initial vector by loading a vector
6966      full of the neutral value and then shift-and-inserting the start
6967      values into the low-numbered elements.  */
6968   if ((double_reduc || neutral_op)
6969       && !nunits_out.is_constant ()
6970       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6971                                           vectype_out, OPTIMIZE_FOR_SPEED))
6972     {
6973       if (dump_enabled_p ())
6974         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6975                          "reduction on variable-length vectors requires"
6976                          " target support for a vector-shift-and-insert"
6977                          " operation.\n");
6978       return false;
6979     }
6980
6981   /* Check extra constraints for variable-length unchained SLP reductions.  */
6982   if (STMT_SLP_TYPE (stmt_info)
6983       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6984       && !nunits_out.is_constant ())
6985     {
6986       /* We checked above that we could build the initial vector when
6987          there's a neutral element value.  Check here for the case in
6988          which each SLP statement has its own initial value and in which
6989          that value needs to be repeated for every instance of the
6990          statement within the initial vector.  */
6991       unsigned int group_size = SLP_TREE_LANES (slp_node);
6992       if (!neutral_op
6993           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6994                                               TREE_TYPE (vectype_out)))
6995         {
6996           if (dump_enabled_p ())
6997             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6998                              "unsupported form of SLP reduction for"
6999                              " variable-length vectors: cannot build"
7000                              " initial vector.\n");
7001           return false;
7002         }
7003       /* The epilogue code relies on the number of elements being a multiple
7004          of the group size.  The duplicate-and-interleave approach to setting
7005          up the initial vector does too.  */
7006       if (!multiple_p (nunits_out, group_size))
7007         {
7008           if (dump_enabled_p ())
7009             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7010                              "unsupported form of SLP reduction for"
7011                              " variable-length vectors: the vector size"
7012                              " is not a multiple of the number of results.\n");
7013           return false;
7014         }
7015     }
7016
7017   if (reduction_type == COND_REDUCTION)
7018     {
7019       widest_int ni;
7020
7021       if (! max_loop_iterations (loop, &ni))
7022         {
7023           if (dump_enabled_p ())
7024             dump_printf_loc (MSG_NOTE, vect_location,
7025                              "loop count not known, cannot create cond "
7026                              "reduction.\n");
7027           return false;
7028         }
7029       /* Convert backedges to iterations.  */
7030       ni += 1;
7031
7032       /* The additional index will be the same type as the condition.  Check
7033          that the loop can fit into this less one (because we'll use up the
7034          zero slot for when there are no matches).  */
7035       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7036       if (wi::geu_p (ni, wi::to_widest (max_index)))
7037         {
7038           if (dump_enabled_p ())
7039             dump_printf_loc (MSG_NOTE, vect_location,
7040                              "loop size is greater than data size.\n");
7041           return false;
7042         }
7043     }
7044
7045   /* In case the vectorization factor (VF) is bigger than the number
7046      of elements that we can fit in a vectype (nunits), we have to generate
7047      more than one vector stmt - i.e - we need to "unroll" the
7048      vector stmt by a factor VF/nunits.  For more details see documentation
7049      in vectorizable_operation.  */
7050
7051   /* If the reduction is used in an outer loop we need to generate
7052      VF intermediate results, like so (e.g. for ncopies=2):
7053         r0 = phi (init, r0)
7054         r1 = phi (init, r1)
7055         r0 = x0 + r0;
7056         r1 = x1 + r1;
7057     (i.e. we generate VF results in 2 registers).
7058     In this case we have a separate def-use cycle for each copy, and therefore
7059     for each copy we get the vector def for the reduction variable from the
7060     respective phi node created for this copy.
7061
7062     Otherwise (the reduction is unused in the loop nest), we can combine
7063     together intermediate results, like so (e.g. for ncopies=2):
7064         r = phi (init, r)
7065         r = x0 + r;
7066         r = x1 + r;
7067    (i.e. we generate VF/2 results in a single register).
7068    In this case for each copy we get the vector def for the reduction variable
7069    from the vectorized reduction operation generated in the previous iteration.
7070
7071    This only works when we see both the reduction PHI and its only consumer
7072    in vectorizable_reduction and there are no intermediate stmts
7073    participating.  */
7074   if (ncopies > 1
7075       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7076       && reduc_chain_length == 1)
7077     single_defuse_cycle = true;
7078
7079   if (single_defuse_cycle || lane_reduc_code_p)
7080     {
7081       gcc_assert (code != COND_EXPR);
7082
7083       /* 4. Supportable by target?  */
7084       bool ok = true;
7085
7086       /* 4.1. check support for the operation in the loop  */
7087       optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
7088       if (!optab)
7089         {
7090           if (dump_enabled_p ())
7091             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7092                              "no optab.\n");
7093           ok = false;
7094         }
7095
7096       machine_mode vec_mode = TYPE_MODE (vectype_in);
7097       if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7098         {
7099           if (dump_enabled_p ())
7100             dump_printf (MSG_NOTE, "op not supported by target.\n");
7101           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7102               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7103             ok = false;
7104           else
7105             if (dump_enabled_p ())
7106               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7107         }
7108
7109       /* Worthwhile without SIMD support?  */
7110       if (ok
7111           && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7112           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7113         {
7114           if (dump_enabled_p ())
7115             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7116                              "not worthwhile without SIMD support.\n");
7117           ok = false;
7118         }
7119
7120       /* lane-reducing operations have to go through vect_transform_reduction.
7121          For the other cases try without the single cycle optimization.  */
7122       if (!ok)
7123         {
7124           if (lane_reduc_code_p)
7125             return false;
7126           else
7127             single_defuse_cycle = false;
7128         }
7129     }
7130   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7131
7132   /* If the reduction stmt is one of the patterns that have lane
7133      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7134   if ((ncopies > 1 && ! single_defuse_cycle)
7135       && lane_reduc_code_p)
7136     {
7137       if (dump_enabled_p ())
7138         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7139                          "multi def-use cycle not possible for lane-reducing "
7140                          "reduction operation\n");
7141       return false;
7142     }
7143
7144   if (slp_node
7145       && !(!single_defuse_cycle
7146            && code != DOT_PROD_EXPR
7147            && code != WIDEN_SUM_EXPR
7148            && code != SAD_EXPR
7149            && reduction_type != FOLD_LEFT_REDUCTION))
7150     for (i = 0; i < op_type; i++)
7151       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7152         {
7153           if (dump_enabled_p ())
7154             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7155                              "incompatible vector types for invariants\n");
7156           return false;
7157         }
7158
7159   if (slp_node)
7160     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7161   else
7162     vec_num = 1;
7163
7164   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7165                              reduction_type, ncopies, cost_vec);
7166   if (dump_enabled_p ()
7167       && reduction_type == FOLD_LEFT_REDUCTION)
7168     dump_printf_loc (MSG_NOTE, vect_location,
7169                      "using an in-order (fold-left) reduction.\n");
7170   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7171   /* All but single defuse-cycle optimized, lane-reducing and fold-left
7172      reductions go through their own vectorizable_* routines.  */
7173   if (!single_defuse_cycle
7174       && code != DOT_PROD_EXPR
7175       && code != WIDEN_SUM_EXPR
7176       && code != SAD_EXPR
7177       && reduction_type != FOLD_LEFT_REDUCTION)
7178     {
7179       stmt_vec_info tem
7180         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7181       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7182         {
7183           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7184           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7185         }
7186       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7187       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7188     }
7189   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7190     {
7191       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7192       internal_fn cond_fn = get_conditional_internal_fn (code);
7193
7194       if (reduction_type != FOLD_LEFT_REDUCTION
7195           && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7196           && (cond_fn == IFN_LAST
7197               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7198                                                   OPTIMIZE_FOR_SPEED)))
7199         {
7200           if (dump_enabled_p ())
7201             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7202                              "can't operate on partial vectors because"
7203                              " no conditional operation is available.\n");
7204           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7205         }
7206       else if (reduction_type == FOLD_LEFT_REDUCTION
7207                && reduc_fn == IFN_LAST
7208                && !expand_vec_cond_expr_p (vectype_in,
7209                                            truth_type_for (vectype_in),
7210                                            SSA_NAME))
7211         {
7212           if (dump_enabled_p ())
7213             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7214                              "can't operate on partial vectors because"
7215                              " no conditional operation is available.\n");
7216           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7217         }
7218       else
7219         vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7220                                vectype_in, NULL);
7221     }
7222   return true;
7223 }
7224
7225 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7226    value.  */
7227
7228 bool
7229 vect_transform_reduction (loop_vec_info loop_vinfo,
7230                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7231                           gimple **vec_stmt, slp_tree slp_node)
7232 {
7233   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7234   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7235   int i;
7236   int ncopies;
7237   int vec_num;
7238
7239   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7240   gcc_assert (reduc_info->is_reduc_info);
7241
7242   if (nested_in_vect_loop_p (loop, stmt_info))
7243     {
7244       loop = loop->inner;
7245       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7246     }
7247
7248   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7249   enum tree_code code = gimple_assign_rhs_code (stmt);
7250   int op_type = TREE_CODE_LENGTH (code);
7251
7252   /* Flatten RHS.  */
7253   tree ops[3];
7254   switch (get_gimple_rhs_class (code))
7255     {
7256     case GIMPLE_TERNARY_RHS:
7257       ops[2] = gimple_assign_rhs3 (stmt);
7258       /* Fall thru.  */
7259     case GIMPLE_BINARY_RHS:
7260       ops[0] = gimple_assign_rhs1 (stmt);
7261       ops[1] = gimple_assign_rhs2 (stmt);
7262       break;
7263     default:
7264       gcc_unreachable ();
7265     }
7266
7267   /* All uses but the last are expected to be defined in the loop.
7268      The last use is the reduction variable.  In case of nested cycle this
7269      assumption is not true: we use reduc_index to record the index of the
7270      reduction variable.  */
7271   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7272   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7273   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7274   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7275
7276   if (slp_node)
7277     {
7278       ncopies = 1;
7279       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7280     }
7281   else
7282     {
7283       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7284       vec_num = 1;
7285     }
7286
7287   internal_fn cond_fn = get_conditional_internal_fn (code);
7288   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7289   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7290
7291   /* Transform.  */
7292   tree new_temp = NULL_TREE;
7293   auto_vec<tree> vec_oprnds0;
7294   auto_vec<tree> vec_oprnds1;
7295   auto_vec<tree> vec_oprnds2;
7296   tree def0;
7297
7298   if (dump_enabled_p ())
7299     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7300
7301   /* FORNOW: Multiple types are not supported for condition.  */
7302   if (code == COND_EXPR)
7303     gcc_assert (ncopies == 1);
7304
7305   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7306
7307   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7308   if (reduction_type == FOLD_LEFT_REDUCTION)
7309     {
7310       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7311       return vectorize_fold_left_reduction
7312           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7313            reduc_fn, ops, vectype_in, reduc_index, masks);
7314     }
7315
7316   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7317   gcc_assert (single_defuse_cycle
7318               || code == DOT_PROD_EXPR
7319               || code == WIDEN_SUM_EXPR
7320               || code == SAD_EXPR);
7321
7322   /* Create the destination vector  */
7323   tree scalar_dest = gimple_assign_lhs (stmt);
7324   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7325
7326   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7327                      single_defuse_cycle && reduc_index == 0
7328                      ? NULL_TREE : ops[0], &vec_oprnds0,
7329                      single_defuse_cycle && reduc_index == 1
7330                      ? NULL_TREE : ops[1], &vec_oprnds1,
7331                      op_type == ternary_op
7332                      && !(single_defuse_cycle && reduc_index == 2)
7333                      ? ops[2] : NULL_TREE, &vec_oprnds2);
7334   if (single_defuse_cycle)
7335     {
7336       gcc_assert (!slp_node);
7337       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7338                                      ops[reduc_index],
7339                                      reduc_index == 0 ? &vec_oprnds0
7340                                      : (reduc_index == 1 ? &vec_oprnds1
7341                                         : &vec_oprnds2));
7342     }
7343
7344   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7345     {
7346       gimple *new_stmt;
7347       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7348       if (masked_loop_p && !mask_by_cond_expr)
7349         {
7350           /* Make sure that the reduction accumulator is vop[0].  */
7351           if (reduc_index == 1)
7352             {
7353               gcc_assert (commutative_tree_code (code));
7354               std::swap (vop[0], vop[1]);
7355             }
7356           tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7357                                           vectype_in, i);
7358           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7359                                                     vop[0], vop[1], vop[0]);
7360           new_temp = make_ssa_name (vec_dest, call);
7361           gimple_call_set_lhs (call, new_temp);
7362           gimple_call_set_nothrow (call, true);
7363           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7364           new_stmt = call;
7365         }
7366       else
7367         {
7368           if (op_type == ternary_op)
7369             vop[2] = vec_oprnds2[i];
7370
7371           if (masked_loop_p && mask_by_cond_expr)
7372             {
7373               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7374                                               vectype_in, i);
7375               build_vect_cond_expr (code, vop, mask, gsi);
7376             }
7377
7378           new_stmt = gimple_build_assign (vec_dest, code,
7379                                           vop[0], vop[1], vop[2]);
7380           new_temp = make_ssa_name (vec_dest, new_stmt);
7381           gimple_assign_set_lhs (new_stmt, new_temp);
7382           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7383         }
7384
7385       if (slp_node)
7386         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7387       else if (single_defuse_cycle
7388                && i < ncopies - 1)
7389         {
7390           if (reduc_index == 0)
7391             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7392           else if (reduc_index == 1)
7393             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7394           else if (reduc_index == 2)
7395             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7396         }
7397       else
7398         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7399     }
7400
7401   if (!slp_node)
7402     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7403
7404   return true;
7405 }
7406
7407 /* Transform phase of a cycle PHI.  */
7408
7409 bool
7410 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7411                           stmt_vec_info stmt_info, gimple **vec_stmt,
7412                           slp_tree slp_node, slp_instance slp_node_instance)
7413 {
7414   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7415   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7416   int i;
7417   int ncopies;
7418   int j;
7419   bool nested_cycle = false;
7420   int vec_num;
7421
7422   if (nested_in_vect_loop_p (loop, stmt_info))
7423     {
7424       loop = loop->inner;
7425       nested_cycle = true;
7426     }
7427
7428   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7429   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7430   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7431   gcc_assert (reduc_info->is_reduc_info);
7432
7433   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7434       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7435     /* Leave the scalar phi in place.  */
7436     return true;
7437
7438   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7439   /* For a nested cycle we do not fill the above.  */
7440   if (!vectype_in)
7441     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7442   gcc_assert (vectype_in);
7443
7444   if (slp_node)
7445     {
7446       /* The size vect_schedule_slp_instance computes is off for us.  */
7447       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7448                                       * SLP_TREE_LANES (slp_node), vectype_in);
7449       ncopies = 1;
7450     }
7451   else
7452     {
7453       vec_num = 1;
7454       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7455     }
7456
7457   /* Check whether we should use a single PHI node and accumulate
7458      vectors to one before the backedge.  */
7459   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7460     ncopies = 1;
7461
7462   /* Create the destination vector  */
7463   gphi *phi = as_a <gphi *> (stmt_info->stmt);
7464   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7465                                                vectype_out);
7466
7467   /* Get the loop-entry arguments.  */
7468   tree vec_initial_def;
7469   auto_vec<tree> vec_initial_defs;
7470   if (slp_node)
7471     {
7472       vec_initial_defs.reserve (vec_num);
7473       if (nested_cycle)
7474         {
7475           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7476           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7477                              &vec_initial_defs);
7478         }
7479       else
7480         {
7481           gcc_assert (slp_node == slp_node_instance->reduc_phis);
7482           stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7483           tree neutral_op
7484               = neutral_op_for_slp_reduction (slp_node, vectype_out,
7485                                               STMT_VINFO_REDUC_CODE (reduc_info),
7486                                               first != NULL);
7487           get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7488                                           &vec_initial_defs, vec_num,
7489                                           first != NULL, neutral_op);
7490         }
7491     }
7492   else
7493     {
7494       /* Get at the scalar def before the loop, that defines the initial
7495          value of the reduction variable.  */
7496       tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7497                                                 loop_preheader_edge (loop));
7498       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7499          and we can't use zero for induc_val, use initial_def.  Similarly
7500          for REDUC_MIN and initial_def larger than the base.  */
7501       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7502         {
7503           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7504           if (TREE_CODE (initial_def) == INTEGER_CST
7505               && !integer_zerop (induc_val)
7506               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7507                    && tree_int_cst_lt (initial_def, induc_val))
7508                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7509                       && tree_int_cst_lt (induc_val, initial_def))))
7510             {
7511               induc_val = initial_def;
7512               /* Communicate we used the initial_def to epilouge
7513                  generation.  */
7514               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7515             }
7516           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7517           vec_initial_defs.create (ncopies);
7518           for (i = 0; i < ncopies; ++i)
7519             vec_initial_defs.quick_push (vec_initial_def);
7520         }
7521       else if (nested_cycle)
7522         {
7523           /* Do not use an adjustment def as that case is not supported
7524              correctly if ncopies is not one.  */
7525           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7526                                          ncopies, initial_def,
7527                                          &vec_initial_defs);
7528         }
7529       else
7530         {
7531           tree adjustment_def = NULL_TREE;
7532           tree *adjustment_defp = &adjustment_def;
7533           enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7534           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7535             adjustment_defp = NULL;
7536           vec_initial_def
7537             = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7538                                              initial_def, adjustment_defp);
7539           STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7540           vec_initial_defs.create (ncopies);
7541           for (i = 0; i < ncopies; ++i)
7542             vec_initial_defs.quick_push (vec_initial_def);
7543         }
7544     }
7545
7546   /* Generate the reduction PHIs upfront.  */
7547   for (i = 0; i < vec_num; i++)
7548     {
7549       tree vec_init_def = vec_initial_defs[i];
7550       for (j = 0; j < ncopies; j++)
7551         {
7552           /* Create the reduction-phi that defines the reduction
7553              operand.  */
7554           gphi *new_phi = create_phi_node (vec_dest, loop->header);
7555
7556           /* Set the loop-entry arg of the reduction-phi.  */
7557           if (j != 0 && nested_cycle)
7558             vec_init_def = vec_initial_defs[j];
7559           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7560                        UNKNOWN_LOCATION);
7561
7562           /* The loop-latch arg is set in epilogue processing.  */
7563
7564           if (slp_node)
7565             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7566           else
7567             {
7568               if (j == 0)
7569                 *vec_stmt = new_phi;
7570               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7571             }
7572         }
7573     }
7574
7575   return true;
7576 }
7577
7578 /* Vectorizes LC PHIs.  */
7579
7580 bool
7581 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7582                      stmt_vec_info stmt_info, gimple **vec_stmt,
7583                      slp_tree slp_node)
7584 {
7585   if (!loop_vinfo
7586       || !is_a <gphi *> (stmt_info->stmt)
7587       || gimple_phi_num_args (stmt_info->stmt) != 1)
7588     return false;
7589
7590   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7591       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7592     return false;
7593
7594   if (!vec_stmt) /* transformation not required.  */
7595     {
7596       /* Deal with copies from externs or constants that disguise as
7597          loop-closed PHI nodes (PR97886).  */
7598       if (slp_node
7599           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7600                                                 SLP_TREE_VECTYPE (slp_node)))
7601         {
7602           if (dump_enabled_p ())
7603             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7604                              "incompatible vector types for invariants\n");
7605           return false;
7606         }
7607       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7608       return true;
7609     }
7610
7611   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7612   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7613   basic_block bb = gimple_bb (stmt_info->stmt);
7614   edge e = single_pred_edge (bb);
7615   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7616   auto_vec<tree> vec_oprnds;
7617   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7618                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7619                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7620   for (unsigned i = 0; i < vec_oprnds.length (); i++)
7621     {
7622       /* Create the vectorized LC PHI node.  */
7623       gphi *new_phi = create_phi_node (vec_dest, bb);
7624       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7625       if (slp_node)
7626         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7627       else
7628         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7629     }
7630   if (!slp_node)
7631     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7632
7633   return true;
7634 }
7635
7636 /* Vectorizes PHIs.  */
7637
7638 bool
7639 vectorizable_phi (vec_info *,
7640                   stmt_vec_info stmt_info, gimple **vec_stmt,
7641                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7642 {
7643   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7644     return false;
7645
7646   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7647     return false;
7648
7649   tree vectype = SLP_TREE_VECTYPE (slp_node);
7650
7651   if (!vec_stmt) /* transformation not required.  */
7652     {
7653       slp_tree child;
7654       unsigned i;
7655       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7656         if (!child)
7657           {
7658             if (dump_enabled_p ())
7659               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7660                                "PHI node with unvectorized backedge def\n");
7661             return false;
7662           }
7663         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7664           {
7665             if (dump_enabled_p ())
7666               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7667                                "incompatible vector types for invariants\n");
7668             return false;
7669           }
7670       record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7671                         vector_stmt, stmt_info, vectype, 0, vect_body);
7672       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7673       return true;
7674     }
7675
7676   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7677   basic_block bb = gimple_bb (stmt_info->stmt);
7678   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7679   auto_vec<gphi *> new_phis;
7680   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7681     {
7682       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7683
7684       /* Skip not yet vectorized defs.  */
7685       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7686           && SLP_TREE_VEC_STMTS (child).is_empty ())
7687         continue;
7688
7689       auto_vec<tree> vec_oprnds;
7690       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7691       if (!new_phis.exists ())
7692         {
7693           new_phis.create (vec_oprnds.length ());
7694           for (unsigned j = 0; j < vec_oprnds.length (); j++)
7695             {
7696               /* Create the vectorized LC PHI node.  */
7697               new_phis.quick_push (create_phi_node (vec_dest, bb));
7698               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7699             }
7700         }
7701       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7702       for (unsigned j = 0; j < vec_oprnds.length (); j++)
7703         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7704     }
7705   /* We should have at least one already vectorized child.  */
7706   gcc_assert (new_phis.exists ());
7707
7708   return true;
7709 }
7710
7711
7712 /* Function vect_min_worthwhile_factor.
7713
7714    For a loop where we could vectorize the operation indicated by CODE,
7715    return the minimum vectorization factor that makes it worthwhile
7716    to use generic vectors.  */
7717 static unsigned int
7718 vect_min_worthwhile_factor (enum tree_code code)
7719 {
7720   switch (code)
7721     {
7722     case PLUS_EXPR:
7723     case MINUS_EXPR:
7724     case NEGATE_EXPR:
7725       return 4;
7726
7727     case BIT_AND_EXPR:
7728     case BIT_IOR_EXPR:
7729     case BIT_XOR_EXPR:
7730     case BIT_NOT_EXPR:
7731       return 2;
7732
7733     default:
7734       return INT_MAX;
7735     }
7736 }
7737
7738 /* Return true if VINFO indicates we are doing loop vectorization and if
7739    it is worth decomposing CODE operations into scalar operations for
7740    that loop's vectorization factor.  */
7741
7742 bool
7743 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7744 {
7745   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7746   unsigned HOST_WIDE_INT value;
7747   return (loop_vinfo
7748           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7749           && value >= vect_min_worthwhile_factor (code));
7750 }
7751
7752 /* Function vectorizable_induction
7753
7754    Check if STMT_INFO performs an induction computation that can be vectorized.
7755    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7756    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7757    Return true if STMT_INFO is vectorizable in this way.  */
7758
7759 bool
7760 vectorizable_induction (loop_vec_info loop_vinfo,
7761                         stmt_vec_info stmt_info,
7762                         gimple **vec_stmt, slp_tree slp_node,
7763                         stmt_vector_for_cost *cost_vec)
7764 {
7765   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7766   unsigned ncopies;
7767   bool nested_in_vect_loop = false;
7768   class loop *iv_loop;
7769   tree vec_def;
7770   edge pe = loop_preheader_edge (loop);
7771   basic_block new_bb;
7772   tree new_vec, vec_init, vec_step, t;
7773   tree new_name;
7774   gimple *new_stmt;
7775   gphi *induction_phi;
7776   tree induc_def, vec_dest;
7777   tree init_expr, step_expr;
7778   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7779   unsigned i;
7780   tree expr;
7781   gimple_stmt_iterator si;
7782
7783   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7784   if (!phi)
7785     return false;
7786
7787   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7788     return false;
7789
7790   /* Make sure it was recognized as induction computation.  */
7791   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7792     return false;
7793
7794   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7795   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7796
7797   if (slp_node)
7798     ncopies = 1;
7799   else
7800     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7801   gcc_assert (ncopies >= 1);
7802
7803   /* FORNOW. These restrictions should be relaxed.  */
7804   if (nested_in_vect_loop_p (loop, stmt_info))
7805     {
7806       imm_use_iterator imm_iter;
7807       use_operand_p use_p;
7808       gimple *exit_phi;
7809       edge latch_e;
7810       tree loop_arg;
7811
7812       if (ncopies > 1)
7813         {
7814           if (dump_enabled_p ())
7815             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7816                              "multiple types in nested loop.\n");
7817           return false;
7818         }
7819
7820       exit_phi = NULL;
7821       latch_e = loop_latch_edge (loop->inner);
7822       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7823       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7824         {
7825           gimple *use_stmt = USE_STMT (use_p);
7826           if (is_gimple_debug (use_stmt))
7827             continue;
7828
7829           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7830             {
7831               exit_phi = use_stmt;
7832               break;
7833             }
7834         }
7835       if (exit_phi)
7836         {
7837           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7838           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7839                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7840             {
7841               if (dump_enabled_p ())
7842                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7843                                  "inner-loop induction only used outside "
7844                                  "of the outer vectorized loop.\n");
7845               return false;
7846             }
7847         }
7848
7849       nested_in_vect_loop = true;
7850       iv_loop = loop->inner;
7851     }
7852   else
7853     iv_loop = loop;
7854   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7855
7856   if (slp_node && !nunits.is_constant ())
7857     {
7858       /* The current SLP code creates the step value element-by-element.  */
7859       if (dump_enabled_p ())
7860         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7861                          "SLP induction not supported for variable-length"
7862                          " vectors.\n");
7863       return false;
7864     }
7865
7866   if (!vec_stmt) /* transformation not required.  */
7867     {
7868       unsigned inside_cost = 0, prologue_cost = 0;
7869       if (slp_node)
7870         {
7871           /* We eventually need to set a vector type on invariant
7872              arguments.  */
7873           unsigned j;
7874           slp_tree child;
7875           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7876             if (!vect_maybe_update_slp_op_vectype
7877                 (child, SLP_TREE_VECTYPE (slp_node)))
7878               {
7879                 if (dump_enabled_p ())
7880                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7881                                    "incompatible vector types for "
7882                                    "invariants\n");
7883                 return false;
7884               }
7885           /* loop cost for vec_loop.  */
7886           inside_cost
7887             = record_stmt_cost (cost_vec,
7888                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7889                                 vector_stmt, stmt_info, 0, vect_body);
7890           /* prologue cost for vec_init (if not nested) and step.  */
7891           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
7892                                             scalar_to_vec,
7893                                             stmt_info, 0, vect_prologue);
7894         }
7895       else /* if (!slp_node) */
7896         {
7897           /* loop cost for vec_loop.  */
7898           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
7899                                           stmt_info, 0, vect_body);
7900           /* prologue cost for vec_init and vec_step.  */
7901           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
7902                                             stmt_info, 0, vect_prologue);
7903         }
7904       if (dump_enabled_p ())
7905         dump_printf_loc (MSG_NOTE, vect_location,
7906                          "vect_model_induction_cost: inside_cost = %d, "
7907                          "prologue_cost = %d .\n", inside_cost,
7908                          prologue_cost);
7909
7910       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7911       DUMP_VECT_SCOPE ("vectorizable_induction");
7912       return true;
7913     }
7914
7915   /* Transform.  */
7916
7917   /* Compute a vector variable, initialized with the first VF values of
7918      the induction variable.  E.g., for an iv with IV_PHI='X' and
7919      evolution S, for a vector of 4 units, we want to compute:
7920      [X, X + S, X + 2*S, X + 3*S].  */
7921
7922   if (dump_enabled_p ())
7923     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7924
7925   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7926   gcc_assert (step_expr != NULL_TREE);
7927   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7928
7929   pe = loop_preheader_edge (iv_loop);
7930   /* Find the first insertion point in the BB.  */
7931   basic_block bb = gimple_bb (phi);
7932   si = gsi_after_labels (bb);
7933
7934   /* For SLP induction we have to generate several IVs as for example
7935      with group size 3 we need
7936        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
7937        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
7938   if (slp_node)
7939     {
7940       /* Enforced above.  */
7941       unsigned int const_nunits = nunits.to_constant ();
7942
7943       /* The initial values are vectorized, but any lanes > group_size
7944          need adjustment.  */
7945       slp_tree init_node
7946         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
7947
7948       /* Gather steps.  Since we do not vectorize inductions as
7949          cycles we have to reconstruct the step from SCEV data.  */
7950       unsigned group_size = SLP_TREE_LANES (slp_node);
7951       tree *steps = XALLOCAVEC (tree, group_size);
7952       tree *inits = XALLOCAVEC (tree, group_size);
7953       stmt_vec_info phi_info;
7954       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
7955         {
7956           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
7957           if (!init_node)
7958             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
7959                                            pe->dest_idx);
7960         }
7961
7962       /* Now generate the IVs.  */
7963       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7964       gcc_assert ((const_nunits * nvects) % group_size == 0);
7965       unsigned nivs;
7966       if (nested_in_vect_loop)
7967         nivs = nvects;
7968       else
7969         {
7970           /* Compute the number of distinct IVs we need.  First reduce
7971              group_size if it is a multiple of const_nunits so we get
7972              one IV for a group_size of 4 but const_nunits 2.  */
7973           unsigned group_sizep = group_size;
7974           if (group_sizep % const_nunits == 0)
7975             group_sizep = group_sizep / const_nunits;
7976           nivs = least_common_multiple (group_sizep,
7977                                         const_nunits) / const_nunits;
7978         }
7979       tree stept = TREE_TYPE (step_vectype);
7980       tree lupdate_mul = NULL_TREE;
7981       if (!nested_in_vect_loop)
7982         {
7983           /* The number of iterations covered in one vector iteration.  */
7984           unsigned lup_mul = (nvects * const_nunits) / group_size;
7985           lupdate_mul
7986             = build_vector_from_val (step_vectype,
7987                                      SCALAR_FLOAT_TYPE_P (stept)
7988                                      ? build_real_from_wide (stept, lup_mul,
7989                                                              UNSIGNED)
7990                                      : build_int_cstu (stept, lup_mul));
7991         }
7992       tree peel_mul = NULL_TREE;
7993       gimple_seq init_stmts = NULL;
7994       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
7995         {
7996           if (SCALAR_FLOAT_TYPE_P (stept))
7997             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
7998                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
7999           else
8000             peel_mul = gimple_convert (&init_stmts, stept,
8001                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8002           peel_mul = gimple_build_vector_from_val (&init_stmts,
8003                                                    step_vectype, peel_mul);
8004         }
8005       unsigned ivn;
8006       auto_vec<tree> vec_steps;
8007       for (ivn = 0; ivn < nivs; ++ivn)
8008         {
8009           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8010           tree_vector_builder init_elts (vectype, const_nunits, 1);
8011           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8012           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8013             {
8014               /* The scalar steps of the IVs.  */
8015               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8016               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8017               step_elts.quick_push (elt);
8018               if (!init_node)
8019                 {
8020                   /* The scalar inits of the IVs if not vectorized.  */
8021                   elt = inits[(ivn*const_nunits + eltn) % group_size];
8022                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
8023                                                   TREE_TYPE (elt)))
8024                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8025                                         TREE_TYPE (vectype), elt);
8026                   init_elts.quick_push (elt);
8027                 }
8028               /* The number of steps to add to the initial values.  */
8029               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8030               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8031                                    ? build_real_from_wide (stept,
8032                                                            mul_elt, UNSIGNED)
8033                                    : build_int_cstu (stept, mul_elt));
8034             }
8035           vec_step = gimple_build_vector (&init_stmts, &step_elts);
8036           vec_steps.safe_push (vec_step);
8037           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8038           if (peel_mul)
8039             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8040                                      step_mul, peel_mul);
8041           if (!init_node)
8042             vec_init = gimple_build_vector (&init_stmts, &init_elts);
8043
8044           /* Create the induction-phi that defines the induction-operand.  */
8045           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8046                                             "vec_iv_");
8047           induction_phi = create_phi_node (vec_dest, iv_loop->header);
8048           induc_def = PHI_RESULT (induction_phi);
8049
8050           /* Create the iv update inside the loop  */
8051           tree up = vec_step;
8052           if (lupdate_mul)
8053             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8054                                vec_step, lupdate_mul);
8055           gimple_seq stmts = NULL;
8056           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8057           vec_def = gimple_build (&stmts,
8058                                   PLUS_EXPR, step_vectype, vec_def, up);
8059           vec_def = gimple_convert (&stmts, vectype, vec_def);
8060           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8061           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8062                        UNKNOWN_LOCATION);
8063
8064           if (init_node)
8065             vec_init = vect_get_slp_vect_def (init_node, ivn);
8066           if (!nested_in_vect_loop
8067               && !integer_zerop (step_mul))
8068             {
8069               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8070               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8071                                  vec_step, step_mul);
8072               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8073                                       vec_def, up);
8074               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8075             }
8076
8077           /* Set the arguments of the phi node:  */
8078           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8079
8080           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8081         }
8082       if (!nested_in_vect_loop)
8083         {
8084           /* Fill up to the number of vectors we need for the whole group.  */
8085           nivs = least_common_multiple (group_size,
8086                                         const_nunits) / const_nunits;
8087           for (; ivn < nivs; ++ivn)
8088             {
8089               SLP_TREE_VEC_STMTS (slp_node)
8090                 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8091               vec_steps.safe_push (vec_steps[0]);
8092             }
8093         }
8094
8095       /* Re-use IVs when we can.  We are generating further vector
8096          stmts by adding VF' * stride to the IVs generated above.  */
8097       if (ivn < nvects)
8098         {
8099           unsigned vfp
8100             = least_common_multiple (group_size, const_nunits) / group_size;
8101           tree lupdate_mul
8102             = build_vector_from_val (step_vectype,
8103                                      SCALAR_FLOAT_TYPE_P (stept)
8104                                      ? build_real_from_wide (stept,
8105                                                              vfp, UNSIGNED)
8106                                      : build_int_cstu (stept, vfp));
8107           for (; ivn < nvects; ++ivn)
8108             {
8109               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8110               tree def = gimple_get_lhs (iv);
8111               if (ivn < 2*nivs)
8112                 vec_steps[ivn - nivs]
8113                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8114                                   vec_steps[ivn - nivs], lupdate_mul);
8115               gimple_seq stmts = NULL;
8116               def = gimple_convert (&stmts, step_vectype, def);
8117               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8118                                   def, vec_steps[ivn % nivs]);
8119               def = gimple_convert (&stmts, vectype, def);
8120               if (gimple_code (iv) == GIMPLE_PHI)
8121                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8122               else
8123                 {
8124                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8125                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8126                 }
8127               SLP_TREE_VEC_STMTS (slp_node)
8128                 .quick_push (SSA_NAME_DEF_STMT (def));
8129             }
8130         }
8131
8132       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8133       gcc_assert (!new_bb);
8134
8135       return true;
8136     }
8137
8138   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
8139                                      loop_preheader_edge (iv_loop));
8140
8141   gimple_seq stmts = NULL;
8142   if (!nested_in_vect_loop)
8143     {
8144       /* Convert the initial value to the IV update type.  */
8145       tree new_type = TREE_TYPE (step_expr);
8146       init_expr = gimple_convert (&stmts, new_type, init_expr);
8147
8148       /* If we are using the loop mask to "peel" for alignment then we need
8149          to adjust the start value here.  */
8150       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8151       if (skip_niters != NULL_TREE)
8152         {
8153           if (FLOAT_TYPE_P (vectype))
8154             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8155                                         skip_niters);
8156           else
8157             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8158           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8159                                          skip_niters, step_expr);
8160           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8161                                     init_expr, skip_step);
8162         }
8163     }
8164
8165   if (stmts)
8166     {
8167       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8168       gcc_assert (!new_bb);
8169     }
8170
8171   /* Create the vector that holds the initial_value of the induction.  */
8172   if (nested_in_vect_loop)
8173     {
8174       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
8175          been created during vectorization of previous stmts.  We obtain it
8176          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
8177       auto_vec<tree> vec_inits;
8178       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8179                                      init_expr, &vec_inits);
8180       vec_init = vec_inits[0];
8181       /* If the initial value is not of proper type, convert it.  */
8182       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8183         {
8184           new_stmt
8185             = gimple_build_assign (vect_get_new_ssa_name (vectype,
8186                                                           vect_simple_var,
8187                                                           "vec_iv_"),
8188                                    VIEW_CONVERT_EXPR,
8189                                    build1 (VIEW_CONVERT_EXPR, vectype,
8190                                            vec_init));
8191           vec_init = gimple_assign_lhs (new_stmt);
8192           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8193                                                  new_stmt);
8194           gcc_assert (!new_bb);
8195         }
8196     }
8197   else
8198     {
8199       /* iv_loop is the loop to be vectorized. Create:
8200          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
8201       stmts = NULL;
8202       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8203
8204       unsigned HOST_WIDE_INT const_nunits;
8205       if (nunits.is_constant (&const_nunits))
8206         {
8207           tree_vector_builder elts (step_vectype, const_nunits, 1);
8208           elts.quick_push (new_name);
8209           for (i = 1; i < const_nunits; i++)
8210             {
8211               /* Create: new_name_i = new_name + step_expr  */
8212               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8213                                        new_name, step_expr);
8214               elts.quick_push (new_name);
8215             }
8216           /* Create a vector from [new_name_0, new_name_1, ...,
8217              new_name_nunits-1]  */
8218           vec_init = gimple_build_vector (&stmts, &elts);
8219         }
8220       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8221         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
8222         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8223                                  new_name, step_expr);
8224       else
8225         {
8226           /* Build:
8227                 [base, base, base, ...]
8228                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
8229           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8230           gcc_assert (flag_associative_math);
8231           tree index = build_index_vector (step_vectype, 0, 1);
8232           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8233                                                         new_name);
8234           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8235                                                         step_expr);
8236           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8237           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8238                                    vec_init, step_vec);
8239           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8240                                    vec_init, base_vec);
8241         }
8242       vec_init = gimple_convert (&stmts, vectype, vec_init);
8243
8244       if (stmts)
8245         {
8246           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8247           gcc_assert (!new_bb);
8248         }
8249     }
8250
8251
8252   /* Create the vector that holds the step of the induction.  */
8253   if (nested_in_vect_loop)
8254     /* iv_loop is nested in the loop to be vectorized. Generate:
8255        vec_step = [S, S, S, S]  */
8256     new_name = step_expr;
8257   else
8258     {
8259       /* iv_loop is the loop to be vectorized. Generate:
8260           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
8261       gimple_seq seq = NULL;
8262       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8263         {
8264           expr = build_int_cst (integer_type_node, vf);
8265           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8266         }
8267       else
8268         expr = build_int_cst (TREE_TYPE (step_expr), vf);
8269       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8270                                expr, step_expr);
8271       if (seq)
8272         {
8273           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8274           gcc_assert (!new_bb);
8275         }
8276     }
8277
8278   t = unshare_expr (new_name);
8279   gcc_assert (CONSTANT_CLASS_P (new_name)
8280               || TREE_CODE (new_name) == SSA_NAME);
8281   new_vec = build_vector_from_val (step_vectype, t);
8282   vec_step = vect_init_vector (loop_vinfo, stmt_info,
8283                                new_vec, step_vectype, NULL);
8284
8285
8286   /* Create the following def-use cycle:
8287      loop prolog:
8288          vec_init = ...
8289          vec_step = ...
8290      loop:
8291          vec_iv = PHI <vec_init, vec_loop>
8292          ...
8293          STMT
8294          ...
8295          vec_loop = vec_iv + vec_step;  */
8296
8297   /* Create the induction-phi that defines the induction-operand.  */
8298   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8299   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8300   induc_def = PHI_RESULT (induction_phi);
8301
8302   /* Create the iv update inside the loop  */
8303   stmts = NULL;
8304   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8305   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8306   vec_def = gimple_convert (&stmts, vectype, vec_def);
8307   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8308   new_stmt = SSA_NAME_DEF_STMT (vec_def);
8309
8310   /* Set the arguments of the phi node:  */
8311   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8312   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8313                UNKNOWN_LOCATION);
8314
8315   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8316   *vec_stmt = induction_phi;
8317
8318   /* In case that vectorization factor (VF) is bigger than the number
8319      of elements that we can fit in a vectype (nunits), we have to generate
8320      more than one vector stmt - i.e - we need to "unroll" the
8321      vector stmt by a factor VF/nunits.  For more details see documentation
8322      in vectorizable_operation.  */
8323
8324   if (ncopies > 1)
8325     {
8326       gimple_seq seq = NULL;
8327       /* FORNOW. This restriction should be relaxed.  */
8328       gcc_assert (!nested_in_vect_loop);
8329
8330       /* Create the vector that holds the step of the induction.  */
8331       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8332         {
8333           expr = build_int_cst (integer_type_node, nunits);
8334           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8335         }
8336       else
8337         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8338       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8339                                expr, step_expr);
8340       if (seq)
8341         {
8342           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8343           gcc_assert (!new_bb);
8344         }
8345
8346       t = unshare_expr (new_name);
8347       gcc_assert (CONSTANT_CLASS_P (new_name)
8348                   || TREE_CODE (new_name) == SSA_NAME);
8349       new_vec = build_vector_from_val (step_vectype, t);
8350       vec_step = vect_init_vector (loop_vinfo, stmt_info,
8351                                    new_vec, step_vectype, NULL);
8352
8353       vec_def = induc_def;
8354       for (i = 1; i < ncopies; i++)
8355         {
8356           /* vec_i = vec_prev + vec_step  */
8357           gimple_seq stmts = NULL;
8358           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8359           vec_def = gimple_build (&stmts,
8360                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
8361           vec_def = gimple_convert (&stmts, vectype, vec_def);
8362
8363           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8364           new_stmt = SSA_NAME_DEF_STMT (vec_def);
8365           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8366         }
8367     }
8368
8369   if (dump_enabled_p ())
8370     dump_printf_loc (MSG_NOTE, vect_location,
8371                      "transform induction: created def-use cycle: %G%G",
8372                      induction_phi, SSA_NAME_DEF_STMT (vec_def));
8373
8374   return true;
8375 }
8376
8377 /* Function vectorizable_live_operation.
8378
8379    STMT_INFO computes a value that is used outside the loop.  Check if
8380    it can be supported.  */
8381
8382 bool
8383 vectorizable_live_operation (vec_info *vinfo,
8384                              stmt_vec_info stmt_info,
8385                              gimple_stmt_iterator *gsi,
8386                              slp_tree slp_node, slp_instance slp_node_instance,
8387                              int slp_index, bool vec_stmt_p,
8388                              stmt_vector_for_cost *cost_vec)
8389 {
8390   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8391   imm_use_iterator imm_iter;
8392   tree lhs, lhs_type, bitsize, vec_bitsize;
8393   tree vectype = (slp_node
8394                   ? SLP_TREE_VECTYPE (slp_node)
8395                   : STMT_VINFO_VECTYPE (stmt_info));
8396   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8397   int ncopies;
8398   gimple *use_stmt;
8399   auto_vec<tree> vec_oprnds;
8400   int vec_entry = 0;
8401   poly_uint64 vec_index = 0;
8402
8403   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8404
8405   /* If a stmt of a reduction is live, vectorize it via
8406      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
8407      validity so just trigger the transform here.  */
8408   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8409     {
8410       if (!vec_stmt_p)
8411         return true;
8412       if (slp_node)
8413         {
8414           /* For reduction chains the meta-info is attached to
8415              the group leader.  */
8416           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8417             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8418           /* For SLP reductions we vectorize the epilogue for
8419              all involved stmts together.  */
8420           else if (slp_index != 0)
8421             return true;
8422           else
8423             /* For SLP reductions the meta-info is attached to
8424                the representative.  */
8425             stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8426         }
8427       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8428       gcc_assert (reduc_info->is_reduc_info);
8429       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8430           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8431         return true;
8432       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8433                                         slp_node_instance);
8434       return true;
8435     }
8436
8437   /* If STMT is not relevant and it is a simple assignment and its inputs are
8438      invariant then it can remain in place, unvectorized.  The original last
8439      scalar value that it computes will be used.  */
8440   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8441     {
8442       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8443       if (dump_enabled_p ())
8444         dump_printf_loc (MSG_NOTE, vect_location,
8445                          "statement is simple and uses invariant.  Leaving in "
8446                          "place.\n");
8447       return true;
8448     }
8449
8450   if (slp_node)
8451     ncopies = 1;
8452   else
8453     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8454
8455   if (slp_node)
8456     {
8457       gcc_assert (slp_index >= 0);
8458
8459       /* Get the last occurrence of the scalar index from the concatenation of
8460          all the slp vectors. Calculate which slp vector it is and the index
8461          within.  */
8462       int num_scalar = SLP_TREE_LANES (slp_node);
8463       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8464       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8465
8466       /* Calculate which vector contains the result, and which lane of
8467          that vector we need.  */
8468       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8469         {
8470           if (dump_enabled_p ())
8471             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8472                              "Cannot determine which vector holds the"
8473                              " final result.\n");
8474           return false;
8475         }
8476     }
8477
8478   if (!vec_stmt_p)
8479     {
8480       /* No transformation required.  */
8481       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8482         {
8483           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8484                                                OPTIMIZE_FOR_SPEED))
8485             {
8486               if (dump_enabled_p ())
8487                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8488                                  "can't operate on partial vectors "
8489                                  "because the target doesn't support extract "
8490                                  "last reduction.\n");
8491               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8492             }
8493           else if (slp_node)
8494             {
8495               if (dump_enabled_p ())
8496                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8497                                  "can't operate on partial vectors "
8498                                  "because an SLP statement is live after "
8499                                  "the loop.\n");
8500               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8501             }
8502           else if (ncopies > 1)
8503             {
8504               if (dump_enabled_p ())
8505                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8506                                  "can't operate on partial vectors "
8507                                  "because ncopies is greater than 1.\n");
8508               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8509             }
8510           else
8511             {
8512               gcc_assert (ncopies == 1 && !slp_node);
8513               vect_record_loop_mask (loop_vinfo,
8514                                      &LOOP_VINFO_MASKS (loop_vinfo),
8515                                      1, vectype, NULL);
8516             }
8517         }
8518       /* ???  Enable for loop costing as well.  */
8519       if (!loop_vinfo)
8520         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8521                           0, vect_epilogue);
8522       return true;
8523     }
8524
8525   /* Use the lhs of the original scalar statement.  */
8526   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8527   if (dump_enabled_p ())
8528     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8529                      "stmt %G", stmt);
8530
8531   lhs = gimple_get_lhs (stmt);
8532   lhs_type = TREE_TYPE (lhs);
8533
8534   bitsize = vector_element_bits_tree (vectype);
8535   vec_bitsize = TYPE_SIZE (vectype);
8536
8537   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8538   tree vec_lhs, bitstart;
8539   gimple *vec_stmt;
8540   if (slp_node)
8541     {
8542       gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8543
8544       /* Get the correct slp vectorized stmt.  */
8545       vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8546       vec_lhs = gimple_get_lhs (vec_stmt);
8547
8548       /* Get entry to use.  */
8549       bitstart = bitsize_int (vec_index);
8550       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8551     }
8552   else
8553     {
8554       /* For multiple copies, get the last copy.  */
8555       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8556       vec_lhs = gimple_get_lhs (vec_stmt);
8557
8558       /* Get the last lane in the vector.  */
8559       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8560     }
8561
8562   if (loop_vinfo)
8563     {
8564       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8565          requirement, insert one phi node for it.  It looks like:
8566            loop;
8567          BB:
8568            # lhs' = PHI <lhs>
8569          ==>
8570            loop;
8571          BB:
8572            # vec_lhs' = PHI <vec_lhs>
8573            new_tree = lane_extract <vec_lhs', ...>;
8574            lhs' = new_tree;  */
8575
8576       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8577       basic_block exit_bb = single_exit (loop)->dest;
8578       gcc_assert (single_pred_p (exit_bb));
8579
8580       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8581       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8582       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8583
8584       gimple_seq stmts = NULL;
8585       tree new_tree;
8586       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8587         {
8588           /* Emit:
8589
8590                SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8591
8592              where VEC_LHS is the vectorized live-out result and MASK is
8593              the loop mask for the final iteration.  */
8594           gcc_assert (ncopies == 1 && !slp_node);
8595           tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8596           tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8597                                           1, vectype, 0);
8598           tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8599                                           mask, vec_lhs_phi);
8600
8601           /* Convert the extracted vector element to the scalar type.  */
8602           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8603         }
8604       else
8605         {
8606           tree bftype = TREE_TYPE (vectype);
8607           if (VECTOR_BOOLEAN_TYPE_P (vectype))
8608             bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8609           new_tree = build3 (BIT_FIELD_REF, bftype,
8610                              vec_lhs_phi, bitsize, bitstart);
8611           new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8612                                            &stmts, true, NULL_TREE);
8613         }
8614
8615       if (stmts)
8616         {
8617           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8618           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8619
8620           /* Remove existing phi from lhs and create one copy from new_tree.  */
8621           tree lhs_phi = NULL_TREE;
8622           gimple_stmt_iterator gsi;
8623           for (gsi = gsi_start_phis (exit_bb);
8624                !gsi_end_p (gsi); gsi_next (&gsi))
8625             {
8626               gimple *phi = gsi_stmt (gsi);
8627               if ((gimple_phi_arg_def (phi, 0) == lhs))
8628                 {
8629                   remove_phi_node (&gsi, false);
8630                   lhs_phi = gimple_phi_result (phi);
8631                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8632                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8633                   break;
8634                 }
8635             }
8636         }
8637
8638       /* Replace use of lhs with newly computed result.  If the use stmt is a
8639          single arg PHI, just replace all uses of PHI result.  It's necessary
8640          because lcssa PHI defining lhs may be before newly inserted stmt.  */
8641       use_operand_p use_p;
8642       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8643         if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8644             && !is_gimple_debug (use_stmt))
8645           {
8646             if (gimple_code (use_stmt) == GIMPLE_PHI
8647                 && gimple_phi_num_args (use_stmt) == 1)
8648               {
8649                 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8650               }
8651             else
8652               {
8653                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8654                     SET_USE (use_p, new_tree);
8655               }
8656             update_stmt (use_stmt);
8657           }
8658     }
8659   else
8660     {
8661       /* For basic-block vectorization simply insert the lane-extraction.  */
8662       tree bftype = TREE_TYPE (vectype);
8663       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8664         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8665       tree new_tree = build3 (BIT_FIELD_REF, bftype,
8666                               vec_lhs, bitsize, bitstart);
8667       gimple_seq stmts = NULL;
8668       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8669                                        &stmts, true, NULL_TREE);
8670       if (TREE_CODE (new_tree) == SSA_NAME
8671           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8672         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8673       if (is_a <gphi *> (vec_stmt))
8674         {
8675           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8676           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8677         }
8678       else
8679         {
8680           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8681           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8682         }
8683
8684       /* Replace use of lhs with newly computed result.  If the use stmt is a
8685          single arg PHI, just replace all uses of PHI result.  It's necessary
8686          because lcssa PHI defining lhs may be before newly inserted stmt.  */
8687       use_operand_p use_p;
8688       stmt_vec_info use_stmt_info;
8689       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8690         if (!is_gimple_debug (use_stmt)
8691             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8692                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8693           {
8694             /* ???  This can happen when the live lane ends up being
8695                used in a vector construction code-generated by an
8696                external SLP node (and code-generation for that already
8697                happened).  See gcc.dg/vect/bb-slp-47.c.
8698                Doing this is what would happen if that vector CTOR
8699                were not code-generated yet so it is not too bad.
8700                ???  In fact we'd likely want to avoid this situation
8701                in the first place.  */
8702             if (TREE_CODE (new_tree) == SSA_NAME
8703                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8704                 && gimple_code (use_stmt) != GIMPLE_PHI
8705                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8706                                                 use_stmt))
8707               {
8708                 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8709                 gcc_assert (code == CONSTRUCTOR
8710                             || code == VIEW_CONVERT_EXPR
8711                             || CONVERT_EXPR_CODE_P (code));
8712                 if (dump_enabled_p ())
8713                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8714                                    "Using original scalar computation for "
8715                                    "live lane because use preceeds vector "
8716                                    "def\n");
8717                 continue;
8718               }
8719             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8720               SET_USE (use_p, new_tree);
8721             update_stmt (use_stmt);
8722           }
8723     }
8724
8725   return true;
8726 }
8727
8728 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
8729
8730 static void
8731 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8732 {
8733   ssa_op_iter op_iter;
8734   imm_use_iterator imm_iter;
8735   def_operand_p def_p;
8736   gimple *ustmt;
8737
8738   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8739     {
8740       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8741         {
8742           basic_block bb;
8743
8744           if (!is_gimple_debug (ustmt))
8745             continue;
8746
8747           bb = gimple_bb (ustmt);
8748
8749           if (!flow_bb_inside_loop_p (loop, bb))
8750             {
8751               if (gimple_debug_bind_p (ustmt))
8752                 {
8753                   if (dump_enabled_p ())
8754                     dump_printf_loc (MSG_NOTE, vect_location,
8755                                      "killing debug use\n");
8756
8757                   gimple_debug_bind_reset_value (ustmt);
8758                   update_stmt (ustmt);
8759                 }
8760               else
8761                 gcc_unreachable ();
8762             }
8763         }
8764     }
8765 }
8766
8767 /* Given loop represented by LOOP_VINFO, return true if computation of
8768    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8769    otherwise.  */
8770
8771 static bool
8772 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8773 {
8774   /* Constant case.  */
8775   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8776     {
8777       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8778       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8779
8780       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8781       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8782       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8783         return true;
8784     }
8785
8786   widest_int max;
8787   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8788   /* Check the upper bound of loop niters.  */
8789   if (get_max_loop_iterations (loop, &max))
8790     {
8791       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8792       signop sgn = TYPE_SIGN (type);
8793       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8794       if (max < type_max)
8795         return true;
8796     }
8797   return false;
8798 }
8799
8800 /* Return a mask type with half the number of elements as OLD_TYPE,
8801    given that it should have mode NEW_MODE.  */
8802
8803 tree
8804 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8805 {
8806   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8807   return build_truth_vector_type_for_mode (nunits, new_mode);
8808 }
8809
8810 /* Return a mask type with twice as many elements as OLD_TYPE,
8811    given that it should have mode NEW_MODE.  */
8812
8813 tree
8814 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8815 {
8816   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8817   return build_truth_vector_type_for_mode (nunits, new_mode);
8818 }
8819
8820 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8821    contain a sequence of NVECTORS masks that each control a vector of type
8822    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
8823    these vector masks with the vector version of SCALAR_MASK.  */
8824
8825 void
8826 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8827                        unsigned int nvectors, tree vectype, tree scalar_mask)
8828 {
8829   gcc_assert (nvectors != 0);
8830   if (masks->length () < nvectors)
8831     masks->safe_grow_cleared (nvectors, true);
8832   rgroup_controls *rgm = &(*masks)[nvectors - 1];
8833   /* The number of scalars per iteration and the number of vectors are
8834      both compile-time constants.  */
8835   unsigned int nscalars_per_iter
8836     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8837                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8838
8839   if (scalar_mask)
8840     {
8841       scalar_cond_masked_key cond (scalar_mask, nvectors);
8842       loop_vinfo->scalar_cond_masked_set.add (cond);
8843     }
8844
8845   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8846     {
8847       rgm->max_nscalars_per_iter = nscalars_per_iter;
8848       rgm->type = truth_type_for (vectype);
8849       rgm->factor = 1;
8850     }
8851 }
8852
8853 /* Given a complete set of masks MASKS, extract mask number INDEX
8854    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8855    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8856
8857    See the comment above vec_loop_masks for more details about the mask
8858    arrangement.  */
8859
8860 tree
8861 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8862                     unsigned int nvectors, tree vectype, unsigned int index)
8863 {
8864   rgroup_controls *rgm = &(*masks)[nvectors - 1];
8865   tree mask_type = rgm->type;
8866
8867   /* Populate the rgroup's mask array, if this is the first time we've
8868      used it.  */
8869   if (rgm->controls.is_empty ())
8870     {
8871       rgm->controls.safe_grow_cleared (nvectors, true);
8872       for (unsigned int i = 0; i < nvectors; ++i)
8873         {
8874           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8875           /* Provide a dummy definition until the real one is available.  */
8876           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8877           rgm->controls[i] = mask;
8878         }
8879     }
8880
8881   tree mask = rgm->controls[index];
8882   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8883                 TYPE_VECTOR_SUBPARTS (vectype)))
8884     {
8885       /* A loop mask for data type X can be reused for data type Y
8886          if X has N times more elements than Y and if Y's elements
8887          are N times bigger than X's.  In this case each sequence
8888          of N elements in the loop mask will be all-zero or all-one.
8889          We can then view-convert the mask so that each sequence of
8890          N elements is replaced by a single element.  */
8891       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8892                               TYPE_VECTOR_SUBPARTS (vectype)));
8893       gimple_seq seq = NULL;
8894       mask_type = truth_type_for (vectype);
8895       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8896       if (seq)
8897         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8898     }
8899   return mask;
8900 }
8901
8902 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8903    lengths for controlling an operation on VECTYPE.  The operation splits
8904    each element of VECTYPE into FACTOR separate subelements, measuring the
8905    length as a number of these subelements.  */
8906
8907 void
8908 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8909                       unsigned int nvectors, tree vectype, unsigned int factor)
8910 {
8911   gcc_assert (nvectors != 0);
8912   if (lens->length () < nvectors)
8913     lens->safe_grow_cleared (nvectors, true);
8914   rgroup_controls *rgl = &(*lens)[nvectors - 1];
8915
8916   /* The number of scalars per iteration, scalar occupied bytes and
8917      the number of vectors are both compile-time constants.  */
8918   unsigned int nscalars_per_iter
8919     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8920                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8921
8922   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8923     {
8924       /* For now, we only support cases in which all loads and stores fall back
8925          to VnQI or none do.  */
8926       gcc_assert (!rgl->max_nscalars_per_iter
8927                   || (rgl->factor == 1 && factor == 1)
8928                   || (rgl->max_nscalars_per_iter * rgl->factor
8929                       == nscalars_per_iter * factor));
8930       rgl->max_nscalars_per_iter = nscalars_per_iter;
8931       rgl->type = vectype;
8932       rgl->factor = factor;
8933     }
8934 }
8935
8936 /* Given a complete set of length LENS, extract length number INDEX for an
8937    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
8938
8939 tree
8940 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8941                    unsigned int nvectors, unsigned int index)
8942 {
8943   rgroup_controls *rgl = &(*lens)[nvectors - 1];
8944
8945   /* Populate the rgroup's len array, if this is the first time we've
8946      used it.  */
8947   if (rgl->controls.is_empty ())
8948     {
8949       rgl->controls.safe_grow_cleared (nvectors, true);
8950       for (unsigned int i = 0; i < nvectors; ++i)
8951         {
8952           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8953           gcc_assert (len_type != NULL_TREE);
8954           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
8955
8956           /* Provide a dummy definition until the real one is available.  */
8957           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
8958           rgl->controls[i] = len;
8959         }
8960     }
8961
8962   return rgl->controls[index];
8963 }
8964
8965 /* Scale profiling counters by estimation for LOOP which is vectorized
8966    by factor VF.  */
8967
8968 static void
8969 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8970 {
8971   edge preheader = loop_preheader_edge (loop);
8972   /* Reduce loop iterations by the vectorization factor.  */
8973   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8974   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8975
8976   if (freq_h.nonzero_p ())
8977     {
8978       profile_probability p;
8979
8980       /* Avoid dropping loop body profile counter to 0 because of zero count
8981          in loop's preheader.  */
8982       if (!(freq_e == profile_count::zero ()))
8983         freq_e = freq_e.force_nonzero ();
8984       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8985       scale_loop_frequencies (loop, p);
8986     }
8987
8988   edge exit_e = single_exit (loop);
8989   exit_e->probability = profile_probability::always ()
8990                                  .apply_scale (1, new_est_niter + 1);
8991
8992   edge exit_l = single_pred_edge (loop->latch);
8993   profile_probability prob = exit_l->probability;
8994   exit_l->probability = exit_e->probability.invert ();
8995   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8996     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8997 }
8998
8999 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9000    latch edge values originally defined by it.  */
9001
9002 static void
9003 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9004                                      stmt_vec_info def_stmt_info)
9005 {
9006   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9007   if (!def || TREE_CODE (def) != SSA_NAME)
9008     return;
9009   stmt_vec_info phi_info;
9010   imm_use_iterator iter;
9011   use_operand_p use_p;
9012   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9013     if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9014       if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9015           && (phi_info = loop_vinfo->lookup_stmt (phi))
9016           && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9017           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9018           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9019         {
9020           loop_p loop = gimple_bb (phi)->loop_father;
9021           edge e = loop_latch_edge (loop);
9022           if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9023             {
9024               vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9025               vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9026               gcc_assert (phi_defs.length () == latch_defs.length ());
9027               for (unsigned i = 0; i < phi_defs.length (); ++i)
9028                 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9029                              gimple_get_lhs (latch_defs[i]), e,
9030                              gimple_phi_arg_location (phi, e->dest_idx));
9031             }
9032         }
9033 }
9034
9035 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9036    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9037    stmt_vec_info.  */
9038
9039 static void
9040 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9041                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9042 {
9043   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9044   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9045
9046   if (dump_enabled_p ())
9047     dump_printf_loc (MSG_NOTE, vect_location,
9048                      "------>vectorizing statement: %G", stmt_info->stmt);
9049
9050   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9051     vect_loop_kill_debug_uses (loop, stmt_info);
9052
9053   if (!STMT_VINFO_RELEVANT_P (stmt_info)
9054       && !STMT_VINFO_LIVE_P (stmt_info))
9055     return;
9056
9057   if (STMT_VINFO_VECTYPE (stmt_info))
9058     {
9059       poly_uint64 nunits
9060         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9061       if (!STMT_SLP_TYPE (stmt_info)
9062           && maybe_ne (nunits, vf)
9063           && dump_enabled_p ())
9064         /* For SLP VF is set according to unrolling factor, and not
9065            to vector size, hence for SLP this print is not valid.  */
9066         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9067     }
9068
9069   /* Pure SLP statements have already been vectorized.  We still need
9070      to apply loop vectorization to hybrid SLP statements.  */
9071   if (PURE_SLP_STMT (stmt_info))
9072     return;
9073
9074   if (dump_enabled_p ())
9075     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9076
9077   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9078     *seen_store = stmt_info;
9079 }
9080
9081 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
9082    in the hash_map with its corresponding values.  */
9083
9084 static tree
9085 find_in_mapping (tree t, void *context)
9086 {
9087   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9088
9089   tree *value = mapping->get (t);
9090   return value ? *value : t;
9091 }
9092
9093 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
9094    original loop that has now been vectorized.
9095
9096    The inits of the data_references need to be advanced with the number of
9097    iterations of the main loop.  This has been computed in vect_do_peeling and
9098    is stored in parameter ADVANCE.  We first restore the data_references
9099    initial offset with the values recored in ORIG_DRS_INIT.
9100
9101    Since the loop_vec_info of this EPILOGUE was constructed for the original
9102    loop, its stmt_vec_infos all point to the original statements.  These need
9103    to be updated to point to their corresponding copies as well as the SSA_NAMES
9104    in their PATTERN_DEF_SEQs and RELATED_STMTs.
9105
9106    The data_reference's connections also need to be updated.  Their
9107    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9108    stmt_vec_infos, their statements need to point to their corresponding copy,
9109    if they are gather loads or scatter stores then their reference needs to be
9110    updated to point to its corresponding copy and finally we set
9111    'base_misaligned' to false as we have already peeled for alignment in the
9112    prologue of the main loop.  */
9113
9114 static void
9115 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9116 {
9117   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9118   auto_vec<gimple *> stmt_worklist;
9119   hash_map<tree,tree> mapping;
9120   gimple *orig_stmt, *new_stmt;
9121   gimple_stmt_iterator epilogue_gsi;
9122   gphi_iterator epilogue_phi_gsi;
9123   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9124   basic_block *epilogue_bbs = get_loop_body (epilogue);
9125   unsigned i;
9126
9127   free (LOOP_VINFO_BBS (epilogue_vinfo));
9128   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9129
9130   /* Advance data_reference's with the number of iterations of the previous
9131      loop and its prologue.  */
9132   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9133
9134
9135   /* The EPILOGUE loop is a copy of the original loop so they share the same
9136      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
9137      point to the copied statements.  We also create a mapping of all LHS' in
9138      the original loop and all the LHS' in the EPILOGUE and create worklists to
9139      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
9140   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9141     {
9142       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9143            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9144         {
9145           new_stmt = epilogue_phi_gsi.phi ();
9146
9147           gcc_assert (gimple_uid (new_stmt) > 0);
9148           stmt_vinfo
9149             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9150
9151           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9152           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9153
9154           mapping.put (gimple_phi_result (orig_stmt),
9155                        gimple_phi_result (new_stmt));
9156           /* PHI nodes can not have patterns or related statements.  */
9157           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9158                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9159         }
9160
9161       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9162            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9163         {
9164           new_stmt = gsi_stmt (epilogue_gsi);
9165           if (is_gimple_debug (new_stmt))
9166             continue;
9167
9168           gcc_assert (gimple_uid (new_stmt) > 0);
9169           stmt_vinfo
9170             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9171
9172           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9173           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9174
9175           if (tree old_lhs = gimple_get_lhs (orig_stmt))
9176             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9177
9178           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9179             {
9180               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9181               for (gimple_stmt_iterator gsi = gsi_start (seq);
9182                    !gsi_end_p (gsi); gsi_next (&gsi))
9183                 stmt_worklist.safe_push (gsi_stmt (gsi));
9184             }
9185
9186           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9187           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9188             {
9189               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9190               stmt_worklist.safe_push (stmt);
9191               /* Set BB such that the assert in
9192                 'get_initial_def_for_reduction' is able to determine that
9193                 the BB of the related stmt is inside this loop.  */
9194               gimple_set_bb (stmt,
9195                              gimple_bb (new_stmt));
9196               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9197               gcc_assert (related_vinfo == NULL
9198                           || related_vinfo == stmt_vinfo);
9199             }
9200         }
9201     }
9202
9203   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9204      using the original main loop and thus need to be updated to refer to the
9205      cloned variables used in the epilogue.  */
9206   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9207     {
9208       gimple *stmt = stmt_worklist[i];
9209       tree *new_op;
9210
9211       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9212         {
9213           tree op = gimple_op (stmt, j);
9214           if ((new_op = mapping.get(op)))
9215             gimple_set_op (stmt, j, *new_op);
9216           else
9217             {
9218               /* PR92429: The last argument of simplify_replace_tree disables
9219                  folding when replacing arguments.  This is required as
9220                  otherwise you might end up with different statements than the
9221                  ones analyzed in vect_loop_analyze, leading to different
9222                  vectorization.  */
9223               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9224                                           &find_in_mapping, &mapping, false);
9225               gimple_set_op (stmt, j, op);
9226             }
9227         }
9228     }
9229
9230   struct data_reference *dr;
9231   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9232   FOR_EACH_VEC_ELT (datarefs, i, dr)
9233     {
9234       orig_stmt = DR_STMT (dr);
9235       gcc_assert (gimple_uid (orig_stmt) > 0);
9236       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9237       /* Data references for gather loads and scatter stores do not use the
9238          updated offset we set using ADVANCE.  Instead we have to make sure the
9239          reference in the data references point to the corresponding copy of
9240          the original in the epilogue.  */
9241       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9242           == VMAT_GATHER_SCATTER)
9243         {
9244           DR_REF (dr)
9245             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9246                                      &find_in_mapping, &mapping);
9247           DR_BASE_ADDRESS (dr)
9248             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9249                                      &find_in_mapping, &mapping);
9250         }
9251       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9252       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9253       /* The vector size of the epilogue is smaller than that of the main loop
9254          so the alignment is either the same or lower. This means the dr will
9255          thus by definition be aligned.  */
9256       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9257     }
9258
9259   epilogue_vinfo->shared->datarefs_copy.release ();
9260   epilogue_vinfo->shared->save_datarefs ();
9261 }
9262
9263 /* Function vect_transform_loop.
9264
9265    The analysis phase has determined that the loop is vectorizable.
9266    Vectorize the loop - created vectorized stmts to replace the scalar
9267    stmts in the loop, and update the loop exit condition.
9268    Returns scalar epilogue loop if any.  */
9269
9270 class loop *
9271 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9272 {
9273   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9274   class loop *epilogue = NULL;
9275   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9276   int nbbs = loop->num_nodes;
9277   int i;
9278   tree niters_vector = NULL_TREE;
9279   tree step_vector = NULL_TREE;
9280   tree niters_vector_mult_vf = NULL_TREE;
9281   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9282   unsigned int lowest_vf = constant_lower_bound (vf);
9283   gimple *stmt;
9284   bool check_profitability = false;
9285   unsigned int th;
9286
9287   DUMP_VECT_SCOPE ("vec_transform_loop");
9288
9289   loop_vinfo->shared->check_datarefs ();
9290
9291   /* Use the more conservative vectorization threshold.  If the number
9292      of iterations is constant assume the cost check has been performed
9293      by our caller.  If the threshold makes all loops profitable that
9294      run at least the (estimated) vectorization factor number of times
9295      checking is pointless, too.  */
9296   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9297   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9298     {
9299       if (dump_enabled_p ())
9300         dump_printf_loc (MSG_NOTE, vect_location,
9301                          "Profitability threshold is %d loop iterations.\n",
9302                          th);
9303       check_profitability = true;
9304     }
9305
9306   /* Make sure there exists a single-predecessor exit bb.  Do this before
9307      versioning.   */
9308   edge e = single_exit (loop);
9309   if (! single_pred_p (e->dest))
9310     {
9311       split_loop_exit_edge (e, true);
9312       if (dump_enabled_p ())
9313         dump_printf (MSG_NOTE, "split exit edge\n");
9314     }
9315
9316   /* Version the loop first, if required, so the profitability check
9317      comes first.  */
9318
9319   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9320     {
9321       class loop *sloop
9322         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9323       sloop->force_vectorize = false;
9324       check_profitability = false;
9325     }
9326
9327   /* Make sure there exists a single-predecessor exit bb also on the
9328      scalar loop copy.  Do this after versioning but before peeling
9329      so CFG structure is fine for both scalar and if-converted loop
9330      to make slpeel_duplicate_current_defs_from_edges face matched
9331      loop closed PHI nodes on the exit.  */
9332   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9333     {
9334       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9335       if (! single_pred_p (e->dest))
9336         {
9337           split_loop_exit_edge (e, true);
9338           if (dump_enabled_p ())
9339             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9340         }
9341     }
9342
9343   tree niters = vect_build_loop_niters (loop_vinfo);
9344   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9345   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9346   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9347   tree advance;
9348   drs_init_vec orig_drs_init;
9349
9350   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9351                               &step_vector, &niters_vector_mult_vf, th,
9352                               check_profitability, niters_no_overflow,
9353                               &advance);
9354
9355   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9356       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9357     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9358                             LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9359
9360   if (niters_vector == NULL_TREE)
9361     {
9362       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9363           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9364           && known_eq (lowest_vf, vf))
9365         {
9366           niters_vector
9367             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9368                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9369           step_vector = build_one_cst (TREE_TYPE (niters));
9370         }
9371       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9372         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9373                                      &step_vector, niters_no_overflow);
9374       else
9375         /* vect_do_peeling subtracted the number of peeled prologue
9376            iterations from LOOP_VINFO_NITERS.  */
9377         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9378                                      &niters_vector, &step_vector,
9379                                      niters_no_overflow);
9380     }
9381
9382   /* 1) Make sure the loop header has exactly two entries
9383      2) Make sure we have a preheader basic block.  */
9384
9385   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9386
9387   split_edge (loop_preheader_edge (loop));
9388
9389   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9390     /* This will deal with any possible peeling.  */
9391     vect_prepare_for_masked_peels (loop_vinfo);
9392
9393   /* Schedule the SLP instances first, then handle loop vectorization
9394      below.  */
9395   if (!loop_vinfo->slp_instances.is_empty ())
9396     {
9397       DUMP_VECT_SCOPE ("scheduling SLP instances");
9398       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9399     }
9400
9401   /* FORNOW: the vectorizer supports only loops which body consist
9402      of one basic block (header + empty latch). When the vectorizer will
9403      support more involved loop forms, the order by which the BBs are
9404      traversed need to be reconsidered.  */
9405
9406   for (i = 0; i < nbbs; i++)
9407     {
9408       basic_block bb = bbs[i];
9409       stmt_vec_info stmt_info;
9410
9411       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9412            gsi_next (&si))
9413         {
9414           gphi *phi = si.phi ();
9415           if (dump_enabled_p ())
9416             dump_printf_loc (MSG_NOTE, vect_location,
9417                              "------>vectorizing phi: %G", phi);
9418           stmt_info = loop_vinfo->lookup_stmt (phi);
9419           if (!stmt_info)
9420             continue;
9421
9422           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9423             vect_loop_kill_debug_uses (loop, stmt_info);
9424
9425           if (!STMT_VINFO_RELEVANT_P (stmt_info)
9426               && !STMT_VINFO_LIVE_P (stmt_info))
9427             continue;
9428
9429           if (STMT_VINFO_VECTYPE (stmt_info)
9430               && (maybe_ne
9431                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9432               && dump_enabled_p ())
9433             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9434
9435           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9436                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9437                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9438                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9439                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9440               && ! PURE_SLP_STMT (stmt_info))
9441             {
9442               if (dump_enabled_p ())
9443                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9444               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9445             }
9446         }
9447
9448       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9449            gsi_next (&si))
9450         {
9451           gphi *phi = si.phi ();
9452           stmt_info = loop_vinfo->lookup_stmt (phi);
9453           if (!stmt_info)
9454             continue;
9455
9456           if (!STMT_VINFO_RELEVANT_P (stmt_info)
9457               && !STMT_VINFO_LIVE_P (stmt_info))
9458             continue;
9459
9460           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9461                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9462                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9463                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9464                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9465               && ! PURE_SLP_STMT (stmt_info))
9466             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9467         }
9468
9469       for (gimple_stmt_iterator si = gsi_start_bb (bb);
9470            !gsi_end_p (si);)
9471         {
9472           stmt = gsi_stmt (si);
9473           /* During vectorization remove existing clobber stmts.  */
9474           if (gimple_clobber_p (stmt))
9475             {
9476               unlink_stmt_vdef (stmt);
9477               gsi_remove (&si, true);
9478               release_defs (stmt);
9479             }
9480           else
9481             {
9482               /* Ignore vector stmts created in the outer loop.  */
9483               stmt_info = loop_vinfo->lookup_stmt (stmt);
9484
9485               /* vector stmts created in the outer-loop during vectorization of
9486                  stmts in an inner-loop may not have a stmt_info, and do not
9487                  need to be vectorized.  */
9488               stmt_vec_info seen_store = NULL;
9489               if (stmt_info)
9490                 {
9491                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9492                     {
9493                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9494                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9495                            !gsi_end_p (subsi); gsi_next (&subsi))
9496                         {
9497                           stmt_vec_info pat_stmt_info
9498                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9499                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9500                                                     &si, &seen_store);
9501                         }
9502                       stmt_vec_info pat_stmt_info
9503                         = STMT_VINFO_RELATED_STMT (stmt_info);
9504                       vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
9505                                                 &seen_store);
9506                       maybe_set_vectorized_backedge_value (loop_vinfo,
9507                                                            pat_stmt_info);
9508                     }
9509                   else
9510                     {
9511                       vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9512                                                 &seen_store);
9513                       maybe_set_vectorized_backedge_value (loop_vinfo,
9514                                                            stmt_info);
9515                     }
9516                 }
9517               gsi_next (&si);
9518               if (seen_store)
9519                 {
9520                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9521                     /* Interleaving.  If IS_STORE is TRUE, the
9522                        vectorization of the interleaving chain was
9523                        completed - free all the stores in the chain.  */
9524                     vect_remove_stores (loop_vinfo,
9525                                         DR_GROUP_FIRST_ELEMENT (seen_store));
9526                   else
9527                     /* Free the attached stmt_vec_info and remove the stmt.  */
9528                     loop_vinfo->remove_stmt (stmt_info);
9529                 }
9530             }
9531         }
9532
9533       /* Stub out scalar statements that must not survive vectorization.
9534          Doing this here helps with grouped statements, or statements that
9535          are involved in patterns.  */
9536       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9537            !gsi_end_p (gsi); gsi_next (&gsi))
9538         {
9539           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9540           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9541             {
9542               tree lhs = gimple_get_lhs (call);
9543               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9544                 {
9545                   tree zero = build_zero_cst (TREE_TYPE (lhs));
9546                   gimple *new_stmt = gimple_build_assign (lhs, zero);
9547                   gsi_replace (&gsi, new_stmt, true);
9548                 }
9549             }
9550         }
9551     }                           /* BBs in loop */
9552
9553   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
9554      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
9555   if (integer_onep (step_vector))
9556     niters_no_overflow = true;
9557   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9558                            niters_vector_mult_vf, !niters_no_overflow);
9559
9560   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9561   scale_profile_for_vect_loop (loop, assumed_vf);
9562
9563   /* True if the final iteration might not handle a full vector's
9564      worth of scalar iterations.  */
9565   bool final_iter_may_be_partial
9566     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9567   /* The minimum number of iterations performed by the epilogue.  This
9568      is 1 when peeling for gaps because we always need a final scalar
9569      iteration.  */
9570   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9571   /* +1 to convert latch counts to loop iteration counts,
9572      -min_epilogue_iters to remove iterations that cannot be performed
9573        by the vector code.  */
9574   int bias_for_lowest = 1 - min_epilogue_iters;
9575   int bias_for_assumed = bias_for_lowest;
9576   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9577   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9578     {
9579       /* When the amount of peeling is known at compile time, the first
9580          iteration will have exactly alignment_npeels active elements.
9581          In the worst case it will have at least one.  */
9582       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9583       bias_for_lowest += lowest_vf - min_first_active;
9584       bias_for_assumed += assumed_vf - min_first_active;
9585     }
9586   /* In these calculations the "- 1" converts loop iteration counts
9587      back to latch counts.  */
9588   if (loop->any_upper_bound)
9589     loop->nb_iterations_upper_bound
9590       = (final_iter_may_be_partial
9591          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9592                           lowest_vf) - 1
9593          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9594                            lowest_vf) - 1);
9595   if (loop->any_likely_upper_bound)
9596     loop->nb_iterations_likely_upper_bound
9597       = (final_iter_may_be_partial
9598          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9599                           + bias_for_lowest, lowest_vf) - 1
9600          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9601                            + bias_for_lowest, lowest_vf) - 1);
9602   if (loop->any_estimate)
9603     loop->nb_iterations_estimate
9604       = (final_iter_may_be_partial
9605          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9606                           assumed_vf) - 1
9607          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9608                            assumed_vf) - 1);
9609
9610   if (dump_enabled_p ())
9611     {
9612       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9613         {
9614           dump_printf_loc (MSG_NOTE, vect_location,
9615                            "LOOP VECTORIZED\n");
9616           if (loop->inner)
9617             dump_printf_loc (MSG_NOTE, vect_location,
9618                              "OUTER LOOP VECTORIZED\n");
9619           dump_printf (MSG_NOTE, "\n");
9620         }
9621       else
9622         dump_printf_loc (MSG_NOTE, vect_location,
9623                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9624                          GET_MODE_NAME (loop_vinfo->vector_mode));
9625     }
9626
9627   /* Loops vectorized with a variable factor won't benefit from
9628      unrolling/peeling.  */
9629   if (!vf.is_constant ())
9630     {
9631       loop->unroll = 1;
9632       if (dump_enabled_p ())
9633         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9634                          " variable-length vectorization factor\n");
9635     }
9636   /* Free SLP instances here because otherwise stmt reference counting
9637      won't work.  */
9638   slp_instance instance;
9639   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9640     vect_free_slp_instance (instance);
9641   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9642   /* Clear-up safelen field since its value is invalid after vectorization
9643      since vectorized loop can have loop-carried dependencies.  */
9644   loop->safelen = 0;
9645
9646   if (epilogue)
9647     {
9648       update_epilogue_loop_vinfo (epilogue, advance);
9649
9650       epilogue->simduid = loop->simduid;
9651       epilogue->force_vectorize = loop->force_vectorize;
9652       epilogue->dont_vectorize = false;
9653     }
9654
9655   return epilogue;
9656 }
9657
9658 /* The code below is trying to perform simple optimization - revert
9659    if-conversion for masked stores, i.e. if the mask of a store is zero
9660    do not perform it and all stored value producers also if possible.
9661    For example,
9662      for (i=0; i<n; i++)
9663        if (c[i])
9664         {
9665           p1[i] += 1;
9666           p2[i] = p3[i] +2;
9667         }
9668    this transformation will produce the following semi-hammock:
9669
9670    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9671      {
9672        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9673        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9674        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9675        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9676        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9677        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9678      }
9679 */
9680
9681 void
9682 optimize_mask_stores (class loop *loop)
9683 {
9684   basic_block *bbs = get_loop_body (loop);
9685   unsigned nbbs = loop->num_nodes;
9686   unsigned i;
9687   basic_block bb;
9688   class loop *bb_loop;
9689   gimple_stmt_iterator gsi;
9690   gimple *stmt;
9691   auto_vec<gimple *> worklist;
9692   auto_purge_vect_location sentinel;
9693
9694   vect_location = find_loop_location (loop);
9695   /* Pick up all masked stores in loop if any.  */
9696   for (i = 0; i < nbbs; i++)
9697     {
9698       bb = bbs[i];
9699       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9700            gsi_next (&gsi))
9701         {
9702           stmt = gsi_stmt (gsi);
9703           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9704             worklist.safe_push (stmt);
9705         }
9706     }
9707
9708   free (bbs);
9709   if (worklist.is_empty ())
9710     return;
9711
9712   /* Loop has masked stores.  */
9713   while (!worklist.is_empty ())
9714     {
9715       gimple *last, *last_store;
9716       edge e, efalse;
9717       tree mask;
9718       basic_block store_bb, join_bb;
9719       gimple_stmt_iterator gsi_to;
9720       tree vdef, new_vdef;
9721       gphi *phi;
9722       tree vectype;
9723       tree zero;
9724
9725       last = worklist.pop ();
9726       mask = gimple_call_arg (last, 2);
9727       bb = gimple_bb (last);
9728       /* Create then_bb and if-then structure in CFG, then_bb belongs to
9729          the same loop as if_bb.  It could be different to LOOP when two
9730          level loop-nest is vectorized and mask_store belongs to the inner
9731          one.  */
9732       e = split_block (bb, last);
9733       bb_loop = bb->loop_father;
9734       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9735       join_bb = e->dest;
9736       store_bb = create_empty_bb (bb);
9737       add_bb_to_loop (store_bb, bb_loop);
9738       e->flags = EDGE_TRUE_VALUE;
9739       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9740       /* Put STORE_BB to likely part.  */
9741       efalse->probability = profile_probability::unlikely ();
9742       store_bb->count = efalse->count ();
9743       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9744       if (dom_info_available_p (CDI_DOMINATORS))
9745         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9746       if (dump_enabled_p ())
9747         dump_printf_loc (MSG_NOTE, vect_location,
9748                          "Create new block %d to sink mask stores.",
9749                          store_bb->index);
9750       /* Create vector comparison with boolean result.  */
9751       vectype = TREE_TYPE (mask);
9752       zero = build_zero_cst (vectype);
9753       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9754       gsi = gsi_last_bb (bb);
9755       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9756       /* Create new PHI node for vdef of the last masked store:
9757          .MEM_2 = VDEF <.MEM_1>
9758          will be converted to
9759          .MEM.3 = VDEF <.MEM_1>
9760          and new PHI node will be created in join bb
9761          .MEM_2 = PHI <.MEM_1, .MEM_3>
9762       */
9763       vdef = gimple_vdef (last);
9764       new_vdef = make_ssa_name (gimple_vop (cfun), last);
9765       gimple_set_vdef (last, new_vdef);
9766       phi = create_phi_node (vdef, join_bb);
9767       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9768
9769       /* Put all masked stores with the same mask to STORE_BB if possible.  */
9770       while (true)
9771         {
9772           gimple_stmt_iterator gsi_from;
9773           gimple *stmt1 = NULL;
9774
9775           /* Move masked store to STORE_BB.  */
9776           last_store = last;
9777           gsi = gsi_for_stmt (last);
9778           gsi_from = gsi;
9779           /* Shift GSI to the previous stmt for further traversal.  */
9780           gsi_prev (&gsi);
9781           gsi_to = gsi_start_bb (store_bb);
9782           gsi_move_before (&gsi_from, &gsi_to);
9783           /* Setup GSI_TO to the non-empty block start.  */
9784           gsi_to = gsi_start_bb (store_bb);
9785           if (dump_enabled_p ())
9786             dump_printf_loc (MSG_NOTE, vect_location,
9787                              "Move stmt to created bb\n%G", last);
9788           /* Move all stored value producers if possible.  */
9789           while (!gsi_end_p (gsi))
9790             {
9791               tree lhs;
9792               imm_use_iterator imm_iter;
9793               use_operand_p use_p;
9794               bool res;
9795
9796               /* Skip debug statements.  */
9797               if (is_gimple_debug (gsi_stmt (gsi)))
9798                 {
9799                   gsi_prev (&gsi);
9800                   continue;
9801                 }
9802               stmt1 = gsi_stmt (gsi);
9803               /* Do not consider statements writing to memory or having
9804                  volatile operand.  */
9805               if (gimple_vdef (stmt1)
9806                   || gimple_has_volatile_ops (stmt1))
9807                 break;
9808               gsi_from = gsi;
9809               gsi_prev (&gsi);
9810               lhs = gimple_get_lhs (stmt1);
9811               if (!lhs)
9812                 break;
9813
9814               /* LHS of vectorized stmt must be SSA_NAME.  */
9815               if (TREE_CODE (lhs) != SSA_NAME)
9816                 break;
9817
9818               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9819                 {
9820                   /* Remove dead scalar statement.  */
9821                   if (has_zero_uses (lhs))
9822                     {
9823                       gsi_remove (&gsi_from, true);
9824                       continue;
9825                     }
9826                 }
9827
9828               /* Check that LHS does not have uses outside of STORE_BB.  */
9829               res = true;
9830               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9831                 {
9832                   gimple *use_stmt;
9833                   use_stmt = USE_STMT (use_p);
9834                   if (is_gimple_debug (use_stmt))
9835                     continue;
9836                   if (gimple_bb (use_stmt) != store_bb)
9837                     {
9838                       res = false;
9839                       break;
9840                     }
9841                 }
9842               if (!res)
9843                 break;
9844
9845               if (gimple_vuse (stmt1)
9846                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
9847                 break;
9848
9849               /* Can move STMT1 to STORE_BB.  */
9850               if (dump_enabled_p ())
9851                 dump_printf_loc (MSG_NOTE, vect_location,
9852                                  "Move stmt to created bb\n%G", stmt1);
9853               gsi_move_before (&gsi_from, &gsi_to);
9854               /* Shift GSI_TO for further insertion.  */
9855               gsi_prev (&gsi_to);
9856             }
9857           /* Put other masked stores with the same mask to STORE_BB.  */
9858           if (worklist.is_empty ()
9859               || gimple_call_arg (worklist.last (), 2) != mask
9860               || worklist.last () != stmt1)
9861             break;
9862           last = worklist.pop ();
9863         }
9864       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9865     }
9866 }
9867
9868 /* Decide whether it is possible to use a zero-based induction variable
9869    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
9870    the value that the induction variable must be able to hold in order
9871    to ensure that the rgroups eventually have no active vector elements.
9872    Return -1 otherwise.  */
9873
9874 widest_int
9875 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9876 {
9877   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9878   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9879   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9880
9881   /* Calculate the value that the induction variable must be able
9882      to hit in order to ensure that we end the loop with an all-false mask.
9883      This involves adding the maximum number of inactive trailing scalar
9884      iterations.  */
9885   widest_int iv_limit = -1;
9886   if (max_loop_iterations (loop, &iv_limit))
9887     {
9888       if (niters_skip)
9889         {
9890           /* Add the maximum number of skipped iterations to the
9891              maximum iteration count.  */
9892           if (TREE_CODE (niters_skip) == INTEGER_CST)
9893             iv_limit += wi::to_widest (niters_skip);
9894           else
9895             iv_limit += max_vf - 1;
9896         }
9897       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9898         /* Make a conservatively-correct assumption.  */
9899         iv_limit += max_vf - 1;
9900
9901       /* IV_LIMIT is the maximum number of latch iterations, which is also
9902          the maximum in-range IV value.  Round this value down to the previous
9903          vector alignment boundary and then add an extra full iteration.  */
9904       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9905       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9906     }
9907   return iv_limit;
9908 }
9909
9910 /* For the given rgroup_controls RGC, check whether an induction variable
9911    would ever hit a value that produces a set of all-false masks or zero
9912    lengths before wrapping around.  Return true if it's possible to wrap
9913    around before hitting the desirable value, otherwise return false.  */
9914
9915 bool
9916 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9917 {
9918   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9919
9920   if (iv_limit == -1)
9921     return true;
9922
9923   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9924   unsigned int compare_precision = TYPE_PRECISION (compare_type);
9925   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
9926
9927   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9928     return true;
9929
9930   return false;
9931 }