gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2020 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "tree-scalar-evolution.h"
  47 #include "tree-vectorizer.h"
  48 #include "gimple-fold.h"
  49 #include "cgraph.h"
  50 #include "tree-cfg.h"
  51 #include "tree-if-conv.h"
  52 #include "internal-fn.h"
  53 #include "tree-vector-builder.h"
  54 #include "vec-perm-indices.h"
  55 #include "tree-eh.h"
  56
  57 /* Loop Vectorization Pass.
  58
  59    This pass tries to vectorize loops.
  60
  61    For example, the vectorizer transforms the following simple loop:
  62
  63         short a[N]; short b[N]; short c[N]; int i;
  64
  65         for (i=0; i<N; i++){
  66           a[i] = b[i] + c[i];
  67         }
  68
  69    as if it was manually vectorized by rewriting the source code into:
  70
  71         typedef int __attribute__((mode(V8HI))) v8hi;
  72         short a[N];  short b[N]; short c[N];   int i;
  73         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  74         v8hi va, vb, vc;
  75
  76         for (i=0; i<N/8; i++){
  77           vb = pb[i];
  78           vc = pc[i];
  79           va = vb + vc;
  80           pa[i] = va;
  81         }
  82
  83         The main entry to this pass is vectorize_loops(), in which
  84    the vectorizer applies a set of analyses on a given set of loops,
  85    followed by the actual vectorization transformation for the loops that
  86    had successfully passed the analysis phase.
  87         Throughout this pass we make a distinction between two types of
  88    data: scalars (which are represented by SSA_NAMES), and memory references
  89    ("data-refs").  These two types of data require different handling both
  90    during analysis and transformation. The types of data-refs that the
  91    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  92    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  93    accesses are required to have a simple (consecutive) access pattern.
  94
  95    Analysis phase:
  96    ===============
  97         The driver for the analysis phase is vect_analyze_loop().
  98    It applies a set of analyses, some of which rely on the scalar evolution
  99    analyzer (scev) developed by Sebastian Pop.
 100
 101         During the analysis phase the vectorizer records some information
 102    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 103    loop, as well as general information about the loop as a whole, which is
 104    recorded in a "loop_vec_info" struct attached to each loop.
 105
 106    Transformation phase:
 107    =====================
 108         The loop transformation phase scans all the stmts in the loop, and
 109    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 110    the loop that needs to be vectorized.  It inserts the vector code sequence
 111    just before the scalar stmt S, and records a pointer to the vector code
 112    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 113    attached to S).  This pointer will be used for the vectorization of following
 114    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 115    otherwise, we rely on dead code elimination for removing it.
 116
 117         For example, say stmt S1 was vectorized into stmt VS1:
 118
 119    VS1: vb = px[i];
 120    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 121    S2:  a = b;
 122
 123    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 124    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 125    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 126    resulting sequence would be:
 127
 128    VS1: vb = px[i];
 129    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 130    VS2: va = vb;
 131    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 132
 133         Operands that are not SSA_NAMEs, are data-refs that appear in
 134    load/store operations (like 'x[i]' in S1), and are handled differently.
 135
 136    Target modeling:
 137    =================
 138         Currently the only target specific information that is used is the
 139    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 140    Targets that can support different sizes of vectors, for now will need
 141    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 142    flexibility will be added in the future.
 143
 144         Since we only vectorize operations which vector form can be
 145    expressed using existing tree codes, to verify that an operation is
 146    supported, the vectorizer checks the relevant optab at the relevant
 147    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 148    the value found is CODE_FOR_nothing, then there's no target support, and
 149    we can't vectorize the stmt.
 150
 151    For additional information on this project see:
 152    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 153 */
 154
 155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 157                                                bool *, bool *);
 158
 159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 160    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 161    may already be set for general statements (not just data refs).  */
 162
 163 static opt_result
 164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 165                               bool vectype_maybe_set_p,
 166                               poly_uint64 *vf)
 167 {
 168   gimple *stmt = stmt_info->stmt;
 169
 170   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 171        && !STMT_VINFO_LIVE_P (stmt_info))
 172       || gimple_clobber_p (stmt))
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 176       return opt_result::success ();
 177     }
 178
 179   tree stmt_vectype, nunits_vectype;
 180   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 181                                                    &stmt_vectype,
 182                                                    &nunits_vectype);
 183   if (!res)
 184     return res;
 185
 186   if (stmt_vectype)
 187     {
 188       if (STMT_VINFO_VECTYPE (stmt_info))
 189         /* The only case when a vectype had been already set is for stmts
 190            that contain a data ref, or for "pattern-stmts" (stmts generated
 191            by the vectorizer to represent/replace a certain idiom).  */
 192         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 193                      || vectype_maybe_set_p)
 194                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 195       else
 196         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 197     }
 198
 199   if (nunits_vectype)
 200     vect_update_max_nunits (vf, nunits_vectype);
 201
 202   return opt_result::success ();
 203 }
 204
 205 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 206    types of STMT_INFO and all attached pattern statements and update
 207    the vectorization factor VF accordingly.  Return true on success
 208    or false if something prevented vectorization.  */
 209
 210 static opt_result
 211 vect_determine_vf_for_stmt (vec_info *vinfo,
 212                             stmt_vec_info stmt_info, poly_uint64 *vf)
 213 {
 214   if (dump_enabled_p ())
 215     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 216                      stmt_info->stmt);
 217   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 218   if (!res)
 219     return res;
 220
 221   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 222       && STMT_VINFO_RELATED_STMT (stmt_info))
 223     {
 224       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 225       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 226
 227       /* If a pattern statement has def stmts, analyze them too.  */
 228       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 229            !gsi_end_p (si); gsi_next (&si))
 230         {
 231           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 232           if (dump_enabled_p ())
 233             dump_printf_loc (MSG_NOTE, vect_location,
 234                              "==> examining pattern def stmt: %G",
 235                              def_stmt_info->stmt);
 236           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 237           if (!res)
 238             return res;
 239         }
 240
 241       if (dump_enabled_p ())
 242         dump_printf_loc (MSG_NOTE, vect_location,
 243                          "==> examining pattern statement: %G",
 244                          stmt_info->stmt);
 245       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 246       if (!res)
 247         return res;
 248     }
 249
 250   return opt_result::success ();
 251 }
 252
 253 /* Function vect_determine_vectorization_factor
 254
 255    Determine the vectorization factor (VF).  VF is the number of data elements
 256    that are operated upon in parallel in a single iteration of the vectorized
 257    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 258    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 259    elements can fit in a single vector register.
 260
 261    We currently support vectorization of loops in which all types operated upon
 262    are of the same size.  Therefore this function currently sets VF according to
 263    the size of the types operated upon, and fails if there are multiple sizes
 264    in the loop.
 265
 266    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 267    original loop:
 268         for (i=0; i<N; i++){
 269           a[i] = b[i] + c[i];
 270         }
 271
 272    vectorized loop:
 273         for (i=0; i<N; i+=VF){
 274           a[i:VF] = b[i:VF] + c[i:VF];
 275         }
 276 */
 277
 278 static opt_result
 279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 280 {
 281   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 282   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 283   unsigned nbbs = loop->num_nodes;
 284   poly_uint64 vectorization_factor = 1;
 285   tree scalar_type = NULL_TREE;
 286   gphi *phi;
 287   tree vectype;
 288   stmt_vec_info stmt_info;
 289   unsigned i;
 290
 291   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 292
 293   for (i = 0; i < nbbs; i++)
 294     {
 295       basic_block bb = bbs[i];
 296
 297       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 298            gsi_next (&si))
 299         {
 300           phi = si.phi ();
 301           stmt_info = loop_vinfo->lookup_stmt (phi);
 302           if (dump_enabled_p ())
 303             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 304                              phi);
 305
 306           gcc_assert (stmt_info);
 307
 308           if (STMT_VINFO_RELEVANT_P (stmt_info)
 309               || STMT_VINFO_LIVE_P (stmt_info))
 310             {
 311               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 312               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 313
 314               if (dump_enabled_p ())
 315                 dump_printf_loc (MSG_NOTE, vect_location,
 316                                  "get vectype for scalar type:  %T\n",
 317                                  scalar_type);
 318
 319               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 320               if (!vectype)
 321                 return opt_result::failure_at (phi,
 322                                                "not vectorized: unsupported "
 323                                                "data-type %T\n",
 324                                                scalar_type);
 325               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 326
 327               if (dump_enabled_p ())
 328                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 329                                  vectype);
 330
 331               if (dump_enabled_p ())
 332                 {
 333                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 334                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 335                   dump_printf (MSG_NOTE, "\n");
 336                 }
 337
 338               vect_update_max_nunits (&vectorization_factor, vectype);
 339             }
 340         }
 341
 342       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 343            gsi_next (&si))
 344         {
 345           if (is_gimple_debug (gsi_stmt (si)))
 346             continue;
 347           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 348           opt_result res
 349             = vect_determine_vf_for_stmt (loop_vinfo,
 350                                           stmt_info, &vectorization_factor);
 351           if (!res)
 352             return res;
 353         }
 354     }
 355
 356   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 357   if (dump_enabled_p ())
 358     {
 359       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 360       dump_dec (MSG_NOTE, vectorization_factor);
 361       dump_printf (MSG_NOTE, "\n");
 362     }
 363
 364   if (known_le (vectorization_factor, 1U))
 365     return opt_result::failure_at (vect_location,
 366                                    "not vectorized: unsupported data-type\n");
 367   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 368   return opt_result::success ();
 369 }
 370
 371
 372 /* Function vect_is_simple_iv_evolution.
 373
 374    FORNOW: A simple evolution of an induction variables in the loop is
 375    considered a polynomial evolution.  */
 376
 377 static bool
 378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 379                              tree * step)
 380 {
 381   tree init_expr;
 382   tree step_expr;
 383   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 384   basic_block bb;
 385
 386   /* When there is no evolution in this loop, the evolution function
 387      is not "simple".  */
 388   if (evolution_part == NULL_TREE)
 389     return false;
 390
 391   /* When the evolution is a polynomial of degree >= 2
 392      the evolution function is not "simple".  */
 393   if (tree_is_chrec (evolution_part))
 394     return false;
 395
 396   step_expr = evolution_part;
 397   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 398
 399   if (dump_enabled_p ())
 400     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 401                      step_expr, init_expr);
 402
 403   *init = init_expr;
 404   *step = step_expr;
 405
 406   if (TREE_CODE (step_expr) != INTEGER_CST
 407       && (TREE_CODE (step_expr) != SSA_NAME
 408           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 409               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 410           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 411               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 412                   || !flag_associative_math)))
 413       && (TREE_CODE (step_expr) != REAL_CST
 414           || !flag_associative_math))
 415     {
 416       if (dump_enabled_p ())
 417         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 418                          "step unknown.\n");
 419       return false;
 420     }
 421
 422   return true;
 423 }
 424
 425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 426    what we are assuming is a double reduction.  For example, given
 427    a structure like this:
 428
 429       outer1:
 430         x_1 = PHI <x_4(outer2), ...>;
 431         ...
 432
 433       inner:
 434         x_2 = PHI <x_1(outer1), ...>;
 435         ...
 436         x_3 = ...;
 437         ...
 438
 439       outer2:
 440         x_4 = PHI <x_3(inner)>;
 441         ...
 442
 443    outer loop analysis would treat x_1 as a double reduction phi and
 444    this function would then return true for x_2.  */
 445
 446 static bool
 447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 448 {
 449   use_operand_p use_p;
 450   ssa_op_iter op_iter;
 451   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 452     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 453       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 454         return true;
 455   return false;
 456 }
 457
 458 /* Function vect_analyze_scalar_cycles_1.
 459
 460    Examine the cross iteration def-use cycles of scalar variables
 461    in LOOP.  LOOP_VINFO represents the loop that is now being
 462    considered for vectorization (can be LOOP, or an outer-loop
 463    enclosing LOOP).  */
 464
 465 static void
 466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
 467 {
 468   basic_block bb = loop->header;
 469   tree init, step;
 470   auto_vec<stmt_vec_info, 64> worklist;
 471   gphi_iterator gsi;
 472   bool double_reduc, reduc_chain;
 473
 474   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 475
 476   /* First - identify all inductions.  Reduction detection assumes that all the
 477      inductions have been identified, therefore, this order must not be
 478      changed.  */
 479   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 480     {
 481       gphi *phi = gsi.phi ();
 482       tree access_fn = NULL;
 483       tree def = PHI_RESULT (phi);
 484       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 485
 486       if (dump_enabled_p ())
 487         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 488
 489       /* Skip virtual phi's.  The data dependences that are associated with
 490          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 491       if (virtual_operand_p (def))
 492         continue;
 493
 494       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 495
 496       /* Analyze the evolution function.  */
 497       access_fn = analyze_scalar_evolution (loop, def);
 498       if (access_fn)
 499         {
 500           STRIP_NOPS (access_fn);
 501           if (dump_enabled_p ())
 502             dump_printf_loc (MSG_NOTE, vect_location,
 503                              "Access function of PHI: %T\n", access_fn);
 504           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 505             = initial_condition_in_loop_num (access_fn, loop->num);
 506           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 507             = evolution_part_in_loop_num (access_fn, loop->num);
 508         }
 509
 510       if (!access_fn
 511           || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 512           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 513           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 514               && TREE_CODE (step) != INTEGER_CST))
 515         {
 516           worklist.safe_push (stmt_vinfo);
 517           continue;
 518         }
 519
 520       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 521                   != NULL_TREE);
 522       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 523
 524       if (dump_enabled_p ())
 525         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 526       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 527     }
 528
 529
 530   /* Second - identify all reductions and nested cycles.  */
 531   while (worklist.length () > 0)
 532     {
 533       stmt_vec_info stmt_vinfo = worklist.pop ();
 534       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 535       tree def = PHI_RESULT (phi);
 536
 537       if (dump_enabled_p ())
 538         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 539
 540       gcc_assert (!virtual_operand_p (def)
 541                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 542
 543       stmt_vec_info reduc_stmt_info
 544         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 545                                     &reduc_chain);
 546       if (reduc_stmt_info)
 547         {
 548           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 549           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 550           if (double_reduc)
 551             {
 552               if (dump_enabled_p ())
 553                 dump_printf_loc (MSG_NOTE, vect_location,
 554                                  "Detected double reduction.\n");
 555
 556               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 557               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 558             }
 559           else
 560             {
 561               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 562                 {
 563                   if (dump_enabled_p ())
 564                     dump_printf_loc (MSG_NOTE, vect_location,
 565                                      "Detected vectorizable nested cycle.\n");
 566
 567                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 568                 }
 569               else
 570                 {
 571                   if (dump_enabled_p ())
 572                     dump_printf_loc (MSG_NOTE, vect_location,
 573                                      "Detected reduction.\n");
 574
 575                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 576                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 577                   /* Store the reduction cycles for possible vectorization in
 578                      loop-aware SLP if it was not detected as reduction
 579                      chain.  */
 580                   if (! reduc_chain)
 581                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 582                       (reduc_stmt_info);
 583                 }
 584             }
 585         }
 586       else
 587         if (dump_enabled_p ())
 588           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 589                            "Unknown def-use cycle pattern.\n");
 590     }
 591 }
 592
 593
 594 /* Function vect_analyze_scalar_cycles.
 595
 596    Examine the cross iteration def-use cycles of scalar variables, by
 597    analyzing the loop-header PHIs of scalar variables.  Classify each
 598    cycle as one of the following: invariant, induction, reduction, unknown.
 599    We do that for the loop represented by LOOP_VINFO, and also to its
 600    inner-loop, if exists.
 601    Examples for scalar cycles:
 602
 603    Example1: reduction:
 604
 605               loop1:
 606               for (i=0; i<N; i++)
 607                  sum += a[i];
 608
 609    Example2: induction:
 610
 611               loop2:
 612               for (i=0; i<N; i++)
 613                  a[i] = i;  */
 614
 615 static void
 616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 617 {
 618   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 619
 620   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 621
 622   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 623      Reductions in such inner-loop therefore have different properties than
 624      the reductions in the nest that gets vectorized:
 625      1. When vectorized, they are executed in the same order as in the original
 626         scalar loop, so we can't change the order of computation when
 627         vectorizing them.
 628      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 629         current checks are too strict.  */
 630
 631   if (loop->inner)
 632     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 633 }
 634
 635 /* Transfer group and reduction information from STMT_INFO to its
 636    pattern stmt.  */
 637
 638 static void
 639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 640 {
 641   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 642   stmt_vec_info stmtp;
 643   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 644               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 645   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 646   do
 647     {
 648       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 649       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 650                            == STMT_VINFO_DEF_TYPE (stmt_info));
 651       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 652       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 653       if (stmt_info)
 654         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 655           = STMT_VINFO_RELATED_STMT (stmt_info);
 656     }
 657   while (stmt_info);
 658 }
 659
 660 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 661
 662 static void
 663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 664 {
 665   stmt_vec_info first;
 666   unsigned i;
 667
 668   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 669     if (STMT_VINFO_IN_PATTERN_P (first))
 670       {
 671         stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 672         while (next)
 673           {
 674             if (! STMT_VINFO_IN_PATTERN_P (next)
 675                 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
 676               break;
 677             next = REDUC_GROUP_NEXT_ELEMENT (next);
 678           }
 679         /* If not all stmt in the chain are patterns or if we failed
 680            to update STMT_VINFO_REDUC_IDX try to handle the chain
 681            without patterns.  */
 682         if (! next
 683             && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
 684           {
 685             vect_fixup_reduc_chain (first);
 686             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 687               = STMT_VINFO_RELATED_STMT (first);
 688           }
 689       }
 690 }
 691
 692 /* Function vect_get_loop_niters.
 693
 694    Determine how many iterations the loop is executed and place it
 695    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 696    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 697    niter information holds in ASSUMPTIONS.
 698
 699    Return the loop exit condition.  */
 700
 701
 702 static gcond *
 703 vect_get_loop_niters (class loop *loop, tree *assumptions,
 704                       tree *number_of_iterations, tree *number_of_iterationsm1)
 705 {
 706   edge exit = single_exit (loop);
 707   class tree_niter_desc niter_desc;
 708   tree niter_assumptions, niter, may_be_zero;
 709   gcond *cond = get_loop_exit_condition (loop);
 710
 711   *assumptions = boolean_true_node;
 712   *number_of_iterationsm1 = chrec_dont_know;
 713   *number_of_iterations = chrec_dont_know;
 714   DUMP_VECT_SCOPE ("get_loop_niters");
 715
 716   if (!exit)
 717     return cond;
 718
 719   may_be_zero = NULL_TREE;
 720   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 721       || chrec_contains_undetermined (niter_desc.niter))
 722     return cond;
 723
 724   niter_assumptions = niter_desc.assumptions;
 725   may_be_zero = niter_desc.may_be_zero;
 726   niter = niter_desc.niter;
 727
 728   if (may_be_zero && integer_zerop (may_be_zero))
 729     may_be_zero = NULL_TREE;
 730
 731   if (may_be_zero)
 732     {
 733       if (COMPARISON_CLASS_P (may_be_zero))
 734         {
 735           /* Try to combine may_be_zero with assumptions, this can simplify
 736              computation of niter expression.  */
 737           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 738             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 739                                              niter_assumptions,
 740                                              fold_build1 (TRUTH_NOT_EXPR,
 741                                                           boolean_type_node,
 742                                                           may_be_zero));
 743           else
 744             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 745                                  build_int_cst (TREE_TYPE (niter), 0),
 746                                  rewrite_to_non_trapping_overflow (niter));
 747
 748           may_be_zero = NULL_TREE;
 749         }
 750       else if (integer_nonzerop (may_be_zero))
 751         {
 752           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 753           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 754           return cond;
 755         }
 756       else
 757         return cond;
 758     }
 759
 760   *assumptions = niter_assumptions;
 761   *number_of_iterationsm1 = niter;
 762
 763   /* We want the number of loop header executions which is the number
 764      of latch executions plus one.
 765      ???  For UINT_MAX latch executions this number overflows to zero
 766      for loops like do { n++; } while (n != 0);  */
 767   if (niter && !chrec_contains_undetermined (niter))
 768     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 769                           build_int_cst (TREE_TYPE (niter), 1));
 770   *number_of_iterations = niter;
 771
 772   return cond;
 773 }
 774
 775 /* Function bb_in_loop_p
 776
 777    Used as predicate for dfs order traversal of the loop bbs.  */
 778
 779 static bool
 780 bb_in_loop_p (const_basic_block bb, const void *data)
 781 {
 782   const class loop *const loop = (const class loop *)data;
 783   if (flow_bb_inside_loop_p (loop, bb))
 784     return true;
 785   return false;
 786 }
 787
 788
 789 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 790    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 791
 792 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
 793   : vec_info (vec_info::loop, init_cost (loop_in), shared),
 794     loop (loop_in),
 795     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 796     num_itersm1 (NULL_TREE),
 797     num_iters (NULL_TREE),
 798     num_iters_unchanged (NULL_TREE),
 799     num_iters_assumptions (NULL_TREE),
 800     th (0),
 801     versioning_threshold (0),
 802     vectorization_factor (0),
 803     max_vectorization_factor (0),
 804     mask_skip_niters (NULL_TREE),
 805     rgroup_compare_type (NULL_TREE),
 806     simd_if_cond (NULL_TREE),
 807     unaligned_dr (NULL),
 808     peeling_for_alignment (0),
 809     ptr_mask (0),
 810     ivexpr_map (NULL),
 811     scan_map (NULL),
 812     slp_unrolling_factor (1),
 813     single_scalar_iteration_cost (0),
 814     vec_outside_cost (0),
 815     vec_inside_cost (0),
 816     vectorizable (false),
 817     can_use_partial_vectors_p (true),
 818     using_partial_vectors_p (false),
 819     epil_using_partial_vectors_p (false),
 820     peeling_for_gaps (false),
 821     peeling_for_niter (false),
 822     no_data_dependencies (false),
 823     has_mask_store (false),
 824     scalar_loop_scaling (profile_probability::uninitialized ()),
 825     scalar_loop (NULL),
 826     orig_loop_info (NULL)
 827 {
 828   /* CHECKME: We want to visit all BBs before their successors (except for
 829      latch blocks, for which this assertion wouldn't hold).  In the simple
 830      case of the loop forms we allow, a dfs order of the BBs would the same
 831      as reversed postorder traversal, so we are safe.  */
 832
 833   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 834                                           bbs, loop->num_nodes, loop);
 835   gcc_assert (nbbs == loop->num_nodes);
 836
 837   for (unsigned int i = 0; i < nbbs; i++)
 838     {
 839       basic_block bb = bbs[i];
 840       gimple_stmt_iterator si;
 841
 842       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 843         {
 844           gimple *phi = gsi_stmt (si);
 845           gimple_set_uid (phi, 0);
 846           add_stmt (phi);
 847         }
 848
 849       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 850         {
 851           gimple *stmt = gsi_stmt (si);
 852           gimple_set_uid (stmt, 0);
 853           if (is_gimple_debug (stmt))
 854             continue;
 855           add_stmt (stmt);
 856           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
 857              third argument is the #pragma omp simd if (x) condition, when 0,
 858              loop shouldn't be vectorized, when non-zero constant, it should
 859              be vectorized normally, otherwise versioned with vectorized loop
 860              done if the condition is non-zero at runtime.  */
 861           if (loop_in->simduid
 862               && is_gimple_call (stmt)
 863               && gimple_call_internal_p (stmt)
 864               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
 865               && gimple_call_num_args (stmt) >= 3
 866               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
 867               && (loop_in->simduid
 868                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
 869             {
 870               tree arg = gimple_call_arg (stmt, 2);
 871               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
 872                 simd_if_cond = arg;
 873               else
 874                 gcc_assert (integer_nonzerop (arg));
 875             }
 876         }
 877     }
 878
 879   epilogue_vinfos.create (6);
 880 }
 881
 882 /* Free all levels of rgroup CONTROLS.  */
 883
 884 void
 885 release_vec_loop_controls (vec<rgroup_controls> *controls)
 886 {
 887   rgroup_controls *rgc;
 888   unsigned int i;
 889   FOR_EACH_VEC_ELT (*controls, i, rgc)
 890     rgc->controls.release ();
 891   controls->release ();
 892 }
 893
 894 /* Free all memory used by the _loop_vec_info, as well as all the
 895    stmt_vec_info structs of all the stmts in the loop.  */
 896
 897 _loop_vec_info::~_loop_vec_info ()
 898 {
 899   free (bbs);
 900
 901   release_vec_loop_controls (&masks);
 902   release_vec_loop_controls (&lens);
 903   delete ivexpr_map;
 904   delete scan_map;
 905   epilogue_vinfos.release ();
 906
 907   loop->aux = NULL;
 908 }
 909
 910 /* Return an invariant or register for EXPR and emit necessary
 911    computations in the LOOP_VINFO loop preheader.  */
 912
 913 tree
 914 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 915 {
 916   if (is_gimple_reg (expr)
 917       || is_gimple_min_invariant (expr))
 918     return expr;
 919
 920   if (! loop_vinfo->ivexpr_map)
 921     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 922   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 923   if (! cached)
 924     {
 925       gimple_seq stmts = NULL;
 926       cached = force_gimple_operand (unshare_expr (expr),
 927                                      &stmts, true, NULL_TREE);
 928       if (stmts)
 929         {
 930           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 931           gsi_insert_seq_on_edge_immediate (e, stmts);
 932         }
 933     }
 934   return cached;
 935 }
 936
 937 /* Return true if we can use CMP_TYPE as the comparison type to produce
 938    all masks required to mask LOOP_VINFO.  */
 939
 940 static bool
 941 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 942 {
 943   rgroup_controls *rgm;
 944   unsigned int i;
 945   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 946     if (rgm->type != NULL_TREE
 947         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 948                                             cmp_type, rgm->type,
 949                                             OPTIMIZE_FOR_SPEED))
 950       return false;
 951   return true;
 952 }
 953
 954 /* Calculate the maximum number of scalars per iteration for every
 955    rgroup in LOOP_VINFO.  */
 956
 957 static unsigned int
 958 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
 959 {
 960   unsigned int res = 1;
 961   unsigned int i;
 962   rgroup_controls *rgm;
 963   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 964     res = MAX (res, rgm->max_nscalars_per_iter);
 965   return res;
 966 }
 967
 968 /* Calculate the minimum precision necessary to represent:
 969
 970       MAX_NITERS * FACTOR
 971
 972    as an unsigned integer, where MAX_NITERS is the maximum number of
 973    loop header iterations for the original scalar form of LOOP_VINFO.  */
 974
 975 static unsigned
 976 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
 977 {
 978   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 979
 980   /* Get the maximum number of iterations that is representable
 981      in the counter type.  */
 982   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
 983   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
 984
 985   /* Get a more refined estimate for the number of iterations.  */
 986   widest_int max_back_edges;
 987   if (max_loop_iterations (loop, &max_back_edges))
 988     max_ni = wi::smin (max_ni, max_back_edges + 1);
 989
 990   /* Work out how many bits we need to represent the limit.  */
 991   return wi::min_precision (max_ni * factor, UNSIGNED);
 992 }
 993
 994 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
 995    whether we can actually generate the masks required.  Return true if so,
 996    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
 997
 998 static bool
 999 vect_verify_full_masking (loop_vec_info loop_vinfo)
1000 {
1001   unsigned int min_ni_width;
1002   unsigned int max_nscalars_per_iter
1003     = vect_get_max_nscalars_per_iter (loop_vinfo);
1004
1005   /* Use a normal loop if there are no statements that need masking.
1006      This only happens in rare degenerate cases: it means that the loop
1007      has no loads, no stores, and no live-out values.  */
1008   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1009     return false;
1010
1011   /* Work out how many bits we need to represent the limit.  */
1012   min_ni_width
1013     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1014
1015   /* Find a scalar mode for which WHILE_ULT is supported.  */
1016   opt_scalar_int_mode cmp_mode_iter;
1017   tree cmp_type = NULL_TREE;
1018   tree iv_type = NULL_TREE;
1019   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1020   unsigned int iv_precision = UINT_MAX;
1021
1022   if (iv_limit != -1)
1023     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1024                                       UNSIGNED);
1025
1026   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1027     {
1028       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1029       if (cmp_bits >= min_ni_width
1030           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1031         {
1032           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1033           if (this_type
1034               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1035             {
1036               /* Although we could stop as soon as we find a valid mode,
1037                  there are at least two reasons why that's not always the
1038                  best choice:
1039
1040                  - An IV that's Pmode or wider is more likely to be reusable
1041                    in address calculations than an IV that's narrower than
1042                    Pmode.
1043
1044                  - Doing the comparison in IV_PRECISION or wider allows
1045                    a natural 0-based IV, whereas using a narrower comparison
1046                    type requires mitigations against wrap-around.
1047
1048                  Conversely, if the IV limit is variable, doing the comparison
1049                  in a wider type than the original type can introduce
1050                  unnecessary extensions, so picking the widest valid mode
1051                  is not always a good choice either.
1052
1053                  Here we prefer the first IV type that's Pmode or wider,
1054                  and the first comparison type that's IV_PRECISION or wider.
1055                  (The comparison type must be no wider than the IV type,
1056                  to avoid extensions in the vector loop.)
1057
1058                  ??? We might want to try continuing beyond Pmode for ILP32
1059                  targets if CMP_BITS < IV_PRECISION.  */
1060               iv_type = this_type;
1061               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1062                 cmp_type = this_type;
1063               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1064                 break;
1065             }
1066         }
1067     }
1068
1069   if (!cmp_type)
1070     return false;
1071
1072   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1073   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1074   return true;
1075 }
1076
1077 /* Check whether we can use vector access with length based on precison
1078    comparison.  So far, to keep it simple, we only allow the case that the
1079    precision of the target supported length is larger than the precision
1080    required by loop niters.  */
1081
1082 static bool
1083 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1084 {
1085   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1086     return false;
1087
1088   unsigned int max_nitems_per_iter = 1;
1089   unsigned int i;
1090   rgroup_controls *rgl;
1091   /* Find the maximum number of items per iteration for every rgroup.  */
1092   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1093     {
1094       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1095       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1096     }
1097
1098   /* Work out how many bits we need to represent the length limit.  */
1099   unsigned int min_ni_prec
1100     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1101
1102   /* Now use the maximum of below precisions for one suitable IV type:
1103      - the IV's natural precision
1104      - the precision needed to hold: the maximum number of scalar
1105        iterations multiplied by the scale factor (min_ni_prec above)
1106      - the Pmode precision
1107
1108      If min_ni_prec is less than the precision of the current niters,
1109      we perfer to still use the niters type.  Prefer to use Pmode and
1110      wider IV to avoid narrow conversions.  */
1111
1112   unsigned int ni_prec
1113     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1114   min_ni_prec = MAX (min_ni_prec, ni_prec);
1115   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1116
1117   tree iv_type = NULL_TREE;
1118   opt_scalar_int_mode tmode_iter;
1119   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1120     {
1121       scalar_mode tmode = tmode_iter.require ();
1122       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1123
1124       /* ??? Do we really want to construct one IV whose precision exceeds
1125          BITS_PER_WORD?  */
1126       if (tbits > BITS_PER_WORD)
1127         break;
1128
1129       /* Find the first available standard integral type.  */
1130       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1131         {
1132           iv_type = build_nonstandard_integer_type (tbits, true);
1133           break;
1134         }
1135     }
1136
1137   if (!iv_type)
1138     {
1139       if (dump_enabled_p ())
1140         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141                          "can't vectorize with length-based partial vectors"
1142                          " because there is no suitable iv type.\n");
1143       return false;
1144     }
1145
1146   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1147   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1148
1149   return true;
1150 }
1151
1152 /* Calculate the cost of one scalar iteration of the loop.  */
1153 static void
1154 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1155 {
1156   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1157   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1158   int nbbs = loop->num_nodes, factor;
1159   int innerloop_iters, i;
1160
1161   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1162
1163   /* Gather costs for statements in the scalar loop.  */
1164
1165   /* FORNOW.  */
1166   innerloop_iters = 1;
1167   if (loop->inner)
1168     innerloop_iters = 50; /* FIXME */
1169
1170   for (i = 0; i < nbbs; i++)
1171     {
1172       gimple_stmt_iterator si;
1173       basic_block bb = bbs[i];
1174
1175       if (bb->loop_father == loop->inner)
1176         factor = innerloop_iters;
1177       else
1178         factor = 1;
1179
1180       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1181         {
1182           gimple *stmt = gsi_stmt (si);
1183           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1184
1185           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1186             continue;
1187
1188           /* Skip stmts that are not vectorized inside the loop.  */
1189           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1190           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1191               && (!STMT_VINFO_LIVE_P (vstmt_info)
1192                   || !VECTORIZABLE_CYCLE_DEF
1193                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1194             continue;
1195
1196           vect_cost_for_stmt kind;
1197           if (STMT_VINFO_DATA_REF (stmt_info))
1198             {
1199               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1200                kind = scalar_load;
1201              else
1202                kind = scalar_store;
1203             }
1204           else if (vect_nop_conversion_p (stmt_info))
1205             continue;
1206           else
1207             kind = scalar_stmt;
1208
1209           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1210                             factor, kind, stmt_info, 0, vect_prologue);
1211         }
1212     }
1213
1214   /* Now accumulate cost.  */
1215   void *target_cost_data = init_cost (loop);
1216   stmt_info_for_cost *si;
1217   int j;
1218   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1219                     j, si)
1220     (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1221                           si->kind, si->stmt_info, si->vectype,
1222                           si->misalign, vect_body);
1223   unsigned dummy, body_cost = 0;
1224   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1225   destroy_cost_data (target_cost_data);
1226   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1227 }
1228
1229
1230 /* Function vect_analyze_loop_form_1.
1231
1232    Verify that certain CFG restrictions hold, including:
1233    - the loop has a pre-header
1234    - the loop has a single entry and exit
1235    - the loop exit condition is simple enough
1236    - the number of iterations can be analyzed, i.e, a countable loop.  The
1237      niter could be analyzed under some assumptions.  */
1238
1239 opt_result
1240 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1241                           tree *assumptions, tree *number_of_iterationsm1,
1242                           tree *number_of_iterations, gcond **inner_loop_cond)
1243 {
1244   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1245
1246   /* Different restrictions apply when we are considering an inner-most loop,
1247      vs. an outer (nested) loop.
1248      (FORNOW. May want to relax some of these restrictions in the future).  */
1249
1250   if (!loop->inner)
1251     {
1252       /* Inner-most loop.  We currently require that the number of BBs is
1253          exactly 2 (the header and latch).  Vectorizable inner-most loops
1254          look like this:
1255
1256                         (pre-header)
1257                            |
1258                           header <--------+
1259                            | |            |
1260                            | +--> latch --+
1261                            |
1262                         (exit-bb)  */
1263
1264       if (loop->num_nodes != 2)
1265         return opt_result::failure_at (vect_location,
1266                                        "not vectorized:"
1267                                        " control flow in loop.\n");
1268
1269       if (empty_block_p (loop->header))
1270         return opt_result::failure_at (vect_location,
1271                                        "not vectorized: empty loop.\n");
1272     }
1273   else
1274     {
1275       class loop *innerloop = loop->inner;
1276       edge entryedge;
1277
1278       /* Nested loop. We currently require that the loop is doubly-nested,
1279          contains a single inner loop, and the number of BBs is exactly 5.
1280          Vectorizable outer-loops look like this:
1281
1282                         (pre-header)
1283                            |
1284                           header <---+
1285                            |         |
1286                           inner-loop |
1287                            |         |
1288                           tail ------+
1289                            |
1290                         (exit-bb)
1291
1292          The inner-loop has the properties expected of inner-most loops
1293          as described above.  */
1294
1295       if ((loop->inner)->inner || (loop->inner)->next)
1296         return opt_result::failure_at (vect_location,
1297                                        "not vectorized:"
1298                                        " multiple nested loops.\n");
1299
1300       if (loop->num_nodes != 5)
1301         return opt_result::failure_at (vect_location,
1302                                        "not vectorized:"
1303                                        " control flow in loop.\n");
1304
1305       entryedge = loop_preheader_edge (innerloop);
1306       if (entryedge->src != loop->header
1307           || !single_exit (innerloop)
1308           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1309         return opt_result::failure_at (vect_location,
1310                                        "not vectorized:"
1311                                        " unsupported outerloop form.\n");
1312
1313       /* Analyze the inner-loop.  */
1314       tree inner_niterm1, inner_niter, inner_assumptions;
1315       opt_result res
1316         = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1317                                     &inner_assumptions, &inner_niterm1,
1318                                     &inner_niter, NULL);
1319       if (!res)
1320         {
1321           if (dump_enabled_p ())
1322             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1323                              "not vectorized: Bad inner loop.\n");
1324           return res;
1325         }
1326
1327       /* Don't support analyzing niter under assumptions for inner
1328          loop.  */
1329       if (!integer_onep (inner_assumptions))
1330         return opt_result::failure_at (vect_location,
1331                                        "not vectorized: Bad inner loop.\n");
1332
1333       if (!expr_invariant_in_loop_p (loop, inner_niter))
1334         return opt_result::failure_at (vect_location,
1335                                        "not vectorized: inner-loop count not"
1336                                        " invariant.\n");
1337
1338       if (dump_enabled_p ())
1339         dump_printf_loc (MSG_NOTE, vect_location,
1340                          "Considering outer-loop vectorization.\n");
1341     }
1342
1343   if (!single_exit (loop))
1344     return opt_result::failure_at (vect_location,
1345                                    "not vectorized: multiple exits.\n");
1346   if (EDGE_COUNT (loop->header->preds) != 2)
1347     return opt_result::failure_at (vect_location,
1348                                    "not vectorized:"
1349                                    " too many incoming edges.\n");
1350
1351   /* We assume that the loop exit condition is at the end of the loop. i.e,
1352      that the loop is represented as a do-while (with a proper if-guard
1353      before the loop if needed), where the loop header contains all the
1354      executable statements, and the latch is empty.  */
1355   if (!empty_block_p (loop->latch)
1356       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1357     return opt_result::failure_at (vect_location,
1358                                    "not vectorized: latch block not empty.\n");
1359
1360   /* Make sure the exit is not abnormal.  */
1361   edge e = single_exit (loop);
1362   if (e->flags & EDGE_ABNORMAL)
1363     return opt_result::failure_at (vect_location,
1364                                    "not vectorized:"
1365                                    " abnormal loop exit edge.\n");
1366
1367   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1368                                      number_of_iterationsm1);
1369   if (!*loop_cond)
1370     return opt_result::failure_at
1371       (vect_location,
1372        "not vectorized: complicated exit condition.\n");
1373
1374   if (integer_zerop (*assumptions)
1375       || !*number_of_iterations
1376       || chrec_contains_undetermined (*number_of_iterations))
1377     return opt_result::failure_at
1378       (*loop_cond,
1379        "not vectorized: number of iterations cannot be computed.\n");
1380
1381   if (integer_zerop (*number_of_iterations))
1382     return opt_result::failure_at
1383       (*loop_cond,
1384        "not vectorized: number of iterations = 0.\n");
1385
1386   return opt_result::success ();
1387 }
1388
1389 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1390
1391 opt_loop_vec_info
1392 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1393 {
1394   tree assumptions, number_of_iterations, number_of_iterationsm1;
1395   gcond *loop_cond, *inner_loop_cond = NULL;
1396
1397   opt_result res
1398     = vect_analyze_loop_form_1 (loop, &loop_cond,
1399                                 &assumptions, &number_of_iterationsm1,
1400                                 &number_of_iterations, &inner_loop_cond);
1401   if (!res)
1402     return opt_loop_vec_info::propagate_failure (res);
1403
1404   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1405   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1406   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1407   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1408   if (!integer_onep (assumptions))
1409     {
1410       /* We consider to vectorize this loop by versioning it under
1411          some assumptions.  In order to do this, we need to clear
1412          existing information computed by scev and niter analyzer.  */
1413       scev_reset_htab ();
1414       free_numbers_of_iterations_estimates (loop);
1415       /* Also set flag for this loop so that following scev and niter
1416          analysis are done under the assumptions.  */
1417       loop_constraint_set (loop, LOOP_C_FINITE);
1418       /* Also record the assumptions for versioning.  */
1419       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1420     }
1421
1422   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1423     {
1424       if (dump_enabled_p ())
1425         {
1426           dump_printf_loc (MSG_NOTE, vect_location,
1427                            "Symbolic number of iterations is ");
1428           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1429           dump_printf (MSG_NOTE, "\n");
1430         }
1431     }
1432
1433   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1434   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1435   if (inner_loop_cond)
1436     {
1437       stmt_vec_info inner_loop_cond_info
1438         = loop_vinfo->lookup_stmt (inner_loop_cond);
1439       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1440     }
1441
1442   gcc_assert (!loop->aux);
1443   loop->aux = loop_vinfo;
1444   return opt_loop_vec_info::success (loop_vinfo);
1445 }
1446
1447
1448
1449 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1450    statements update the vectorization factor.  */
1451
1452 static void
1453 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1454 {
1455   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1456   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1457   int nbbs = loop->num_nodes;
1458   poly_uint64 vectorization_factor;
1459   int i;
1460
1461   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1462
1463   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1464   gcc_assert (known_ne (vectorization_factor, 0U));
1465
1466   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1467      vectorization factor of the loop is the unrolling factor required by
1468      the SLP instances.  If that unrolling factor is 1, we say, that we
1469      perform pure SLP on loop - cross iteration parallelism is not
1470      exploited.  */
1471   bool only_slp_in_loop = true;
1472   for (i = 0; i < nbbs; i++)
1473     {
1474       basic_block bb = bbs[i];
1475       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1476            gsi_next (&si))
1477         {
1478           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1479           if (!stmt_info)
1480             continue;
1481           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1482                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1483               && !PURE_SLP_STMT (stmt_info))
1484             /* STMT needs both SLP and loop-based vectorization.  */
1485             only_slp_in_loop = false;
1486         }
1487       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1488            gsi_next (&si))
1489         {
1490           if (is_gimple_debug (gsi_stmt (si)))
1491             continue;
1492           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1493           stmt_info = vect_stmt_to_vectorize (stmt_info);
1494           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1495                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1496               && !PURE_SLP_STMT (stmt_info))
1497             /* STMT needs both SLP and loop-based vectorization.  */
1498             only_slp_in_loop = false;
1499         }
1500     }
1501
1502   if (only_slp_in_loop)
1503     {
1504       if (dump_enabled_p ())
1505         dump_printf_loc (MSG_NOTE, vect_location,
1506                          "Loop contains only SLP stmts\n");
1507       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1508     }
1509   else
1510     {
1511       if (dump_enabled_p ())
1512         dump_printf_loc (MSG_NOTE, vect_location,
1513                          "Loop contains SLP and non-SLP stmts\n");
1514       /* Both the vectorization factor and unroll factor have the form
1515          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1516          so they must have a common multiple.  */
1517       vectorization_factor
1518         = force_common_multiple (vectorization_factor,
1519                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1520     }
1521
1522   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1523   if (dump_enabled_p ())
1524     {
1525       dump_printf_loc (MSG_NOTE, vect_location,
1526                        "Updating vectorization factor to ");
1527       dump_dec (MSG_NOTE, vectorization_factor);
1528       dump_printf (MSG_NOTE, ".\n");
1529     }
1530 }
1531
1532 /* Return true if STMT_INFO describes a double reduction phi and if
1533    the other phi in the reduction is also relevant for vectorization.
1534    This rejects cases such as:
1535
1536       outer1:
1537         x_1 = PHI <x_3(outer2), ...>;
1538         ...
1539
1540       inner:
1541         x_2 = ...;
1542         ...
1543
1544       outer2:
1545         x_3 = PHI <x_2(inner)>;
1546
1547    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1548
1549 static bool
1550 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1551 {
1552   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1553     return false;
1554
1555   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1556 }
1557
1558 /* Function vect_analyze_loop_operations.
1559
1560    Scan the loop stmts and make sure they are all vectorizable.  */
1561
1562 static opt_result
1563 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1564 {
1565   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1566   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1567   int nbbs = loop->num_nodes;
1568   int i;
1569   stmt_vec_info stmt_info;
1570   bool need_to_vectorize = false;
1571   bool ok;
1572
1573   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1574
1575   auto_vec<stmt_info_for_cost> cost_vec;
1576
1577   for (i = 0; i < nbbs; i++)
1578     {
1579       basic_block bb = bbs[i];
1580
1581       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1582            gsi_next (&si))
1583         {
1584           gphi *phi = si.phi ();
1585           ok = true;
1586
1587           stmt_info = loop_vinfo->lookup_stmt (phi);
1588           if (dump_enabled_p ())
1589             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1590           if (virtual_operand_p (gimple_phi_result (phi)))
1591             continue;
1592
1593           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1594              (i.e., a phi in the tail of the outer-loop).  */
1595           if (! is_loop_header_bb_p (bb))
1596             {
1597               /* FORNOW: we currently don't support the case that these phis
1598                  are not used in the outerloop (unless it is double reduction,
1599                  i.e., this phi is vect_reduction_def), cause this case
1600                  requires to actually do something here.  */
1601               if (STMT_VINFO_LIVE_P (stmt_info)
1602                   && !vect_active_double_reduction_p (stmt_info))
1603                 return opt_result::failure_at (phi,
1604                                                "Unsupported loop-closed phi"
1605                                                " in outer-loop.\n");
1606
1607               /* If PHI is used in the outer loop, we check that its operand
1608                  is defined in the inner loop.  */
1609               if (STMT_VINFO_RELEVANT_P (stmt_info))
1610                 {
1611                   tree phi_op;
1612
1613                   if (gimple_phi_num_args (phi) != 1)
1614                     return opt_result::failure_at (phi, "unsupported phi");
1615
1616                   phi_op = PHI_ARG_DEF (phi, 0);
1617                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1618                   if (!op_def_info)
1619                     return opt_result::failure_at (phi, "unsupported phi\n");
1620
1621                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1622                       && (STMT_VINFO_RELEVANT (op_def_info)
1623                           != vect_used_in_outer_by_reduction))
1624                     return opt_result::failure_at (phi, "unsupported phi\n");
1625
1626                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1627                        || (STMT_VINFO_DEF_TYPE (stmt_info)
1628                            == vect_double_reduction_def))
1629                       && !vectorizable_lc_phi (loop_vinfo,
1630                                                stmt_info, NULL, NULL))
1631                     return opt_result::failure_at (phi, "unsupported phi\n");
1632                 }
1633
1634               continue;
1635             }
1636
1637           gcc_assert (stmt_info);
1638
1639           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1640                || STMT_VINFO_LIVE_P (stmt_info))
1641               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1642             /* A scalar-dependence cycle that we don't support.  */
1643             return opt_result::failure_at (phi,
1644                                            "not vectorized:"
1645                                            " scalar dependence cycle.\n");
1646
1647           if (STMT_VINFO_RELEVANT_P (stmt_info))
1648             {
1649               need_to_vectorize = true;
1650               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1651                   && ! PURE_SLP_STMT (stmt_info))
1652                 ok = vectorizable_induction (loop_vinfo,
1653                                              stmt_info, NULL, NULL,
1654                                              &cost_vec);
1655               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1656                         || (STMT_VINFO_DEF_TYPE (stmt_info)
1657                             == vect_double_reduction_def)
1658                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1659                        && ! PURE_SLP_STMT (stmt_info))
1660                 ok = vectorizable_reduction (loop_vinfo,
1661                                              stmt_info, NULL, NULL, &cost_vec);
1662             }
1663
1664           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1665           if (ok
1666               && STMT_VINFO_LIVE_P (stmt_info)
1667               && !PURE_SLP_STMT (stmt_info))
1668             ok = vectorizable_live_operation (loop_vinfo,
1669                                               stmt_info, NULL, NULL, NULL,
1670                                               -1, false, &cost_vec);
1671
1672           if (!ok)
1673             return opt_result::failure_at (phi,
1674                                            "not vectorized: relevant phi not "
1675                                            "supported: %G",
1676                                            static_cast <gimple *> (phi));
1677         }
1678
1679       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1680            gsi_next (&si))
1681         {
1682           gimple *stmt = gsi_stmt (si);
1683           if (!gimple_clobber_p (stmt)
1684               && !is_gimple_debug (stmt))
1685             {
1686               opt_result res
1687                 = vect_analyze_stmt (loop_vinfo,
1688                                      loop_vinfo->lookup_stmt (stmt),
1689                                      &need_to_vectorize,
1690                                      NULL, NULL, &cost_vec);
1691               if (!res)
1692                 return res;
1693             }
1694         }
1695     } /* bbs */
1696
1697   add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1698
1699   /* All operations in the loop are either irrelevant (deal with loop
1700      control, or dead), or only used outside the loop and can be moved
1701      out of the loop (e.g. invariants, inductions).  The loop can be
1702      optimized away by scalar optimizations.  We're better off not
1703      touching this loop.  */
1704   if (!need_to_vectorize)
1705     {
1706       if (dump_enabled_p ())
1707         dump_printf_loc (MSG_NOTE, vect_location,
1708                          "All the computation can be taken out of the loop.\n");
1709       return opt_result::failure_at
1710         (vect_location,
1711          "not vectorized: redundant loop. no profit to vectorize.\n");
1712     }
1713
1714   return opt_result::success ();
1715 }
1716
1717 /* Return true if we know that the iteration count is smaller than the
1718    vectorization factor.  Return false if it isn't, or if we can't be sure
1719    either way.  */
1720
1721 static bool
1722 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1723 {
1724   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1725
1726   HOST_WIDE_INT max_niter;
1727   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1728     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1729   else
1730     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1731
1732   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1733     return true;
1734
1735   return false;
1736 }
1737
1738 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1739    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1740    definitely no, or -1 if it's worth retrying.  */
1741
1742 static int
1743 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1744 {
1745   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1746   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1747
1748   /* Only loops that can handle partially-populated vectors can have iteration
1749      counts less than the vectorization factor.  */
1750   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1751     {
1752       if (vect_known_niters_smaller_than_vf (loop_vinfo))
1753         {
1754           if (dump_enabled_p ())
1755             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1756                              "not vectorized: iteration count smaller than "
1757                              "vectorization factor.\n");
1758           return 0;
1759         }
1760     }
1761
1762   int min_profitable_iters, min_profitable_estimate;
1763   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1764                                       &min_profitable_estimate);
1765
1766   if (min_profitable_iters < 0)
1767     {
1768       if (dump_enabled_p ())
1769         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1770                          "not vectorized: vectorization not profitable.\n");
1771       if (dump_enabled_p ())
1772         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773                          "not vectorized: vector version will never be "
1774                          "profitable.\n");
1775       return -1;
1776     }
1777
1778   int min_scalar_loop_bound = (param_min_vect_loop_bound
1779                                * assumed_vf);
1780
1781   /* Use the cost model only if it is more conservative than user specified
1782      threshold.  */
1783   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1784                                     min_profitable_iters);
1785
1786   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1787
1788   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1789       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1790     {
1791       if (dump_enabled_p ())
1792         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1793                          "not vectorized: vectorization not profitable.\n");
1794       if (dump_enabled_p ())
1795         dump_printf_loc (MSG_NOTE, vect_location,
1796                          "not vectorized: iteration count smaller than user "
1797                          "specified loop bound parameter or minimum profitable "
1798                          "iterations (whichever is more conservative).\n");
1799       return 0;
1800     }
1801
1802   /* The static profitablity threshold min_profitable_estimate includes
1803      the cost of having to check at runtime whether the scalar loop
1804      should be used instead.  If it turns out that we don't need or want
1805      such a check, the threshold we should use for the static estimate
1806      is simply the point at which the vector loop becomes more profitable
1807      than the scalar loop.  */
1808   if (min_profitable_estimate > min_profitable_iters
1809       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1810       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1811       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1812       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1813     {
1814       if (dump_enabled_p ())
1815         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1816                          " choice between the scalar and vector loops\n");
1817       min_profitable_estimate = min_profitable_iters;
1818     }
1819
1820   HOST_WIDE_INT estimated_niter;
1821
1822   /* If we are vectorizing an epilogue then we know the maximum number of
1823      scalar iterations it will cover is at least one lower than the
1824      vectorization factor of the main loop.  */
1825   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1826     estimated_niter
1827       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1828   else
1829     {
1830       estimated_niter = estimated_stmt_executions_int (loop);
1831       if (estimated_niter == -1)
1832         estimated_niter = likely_max_stmt_executions_int (loop);
1833     }
1834   if (estimated_niter != -1
1835       && ((unsigned HOST_WIDE_INT) estimated_niter
1836           < MAX (th, (unsigned) min_profitable_estimate)))
1837     {
1838       if (dump_enabled_p ())
1839         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1840                          "not vectorized: estimated iteration count too "
1841                          "small.\n");
1842       if (dump_enabled_p ())
1843         dump_printf_loc (MSG_NOTE, vect_location,
1844                          "not vectorized: estimated iteration count smaller "
1845                          "than specified loop bound parameter or minimum "
1846                          "profitable iterations (whichever is more "
1847                          "conservative).\n");
1848       return -1;
1849     }
1850
1851   return 1;
1852 }
1853
1854 static opt_result
1855 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1856                            vec<data_reference_p> *datarefs,
1857                            unsigned int *n_stmts)
1858 {
1859   *n_stmts = 0;
1860   for (unsigned i = 0; i < loop->num_nodes; i++)
1861     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1862          !gsi_end_p (gsi); gsi_next (&gsi))
1863       {
1864         gimple *stmt = gsi_stmt (gsi);
1865         if (is_gimple_debug (stmt))
1866           continue;
1867         ++(*n_stmts);
1868         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1869         if (!res)
1870           {
1871             if (is_gimple_call (stmt) && loop->safelen)
1872               {
1873                 tree fndecl = gimple_call_fndecl (stmt), op;
1874                 if (fndecl != NULL_TREE)
1875                   {
1876                     cgraph_node *node = cgraph_node::get (fndecl);
1877                     if (node != NULL && node->simd_clones != NULL)
1878                       {
1879                         unsigned int j, n = gimple_call_num_args (stmt);
1880                         for (j = 0; j < n; j++)
1881                           {
1882                             op = gimple_call_arg (stmt, j);
1883                             if (DECL_P (op)
1884                                 || (REFERENCE_CLASS_P (op)
1885                                     && get_base_address (op)))
1886                               break;
1887                           }
1888                         op = gimple_call_lhs (stmt);
1889                         /* Ignore #pragma omp declare simd functions
1890                            if they don't have data references in the
1891                            call stmt itself.  */
1892                         if (j == n
1893                             && !(op
1894                                  && (DECL_P (op)
1895                                      || (REFERENCE_CLASS_P (op)
1896                                          && get_base_address (op)))))
1897                           continue;
1898                       }
1899                   }
1900               }
1901             return res;
1902           }
1903         /* If dependence analysis will give up due to the limit on the
1904            number of datarefs stop here and fail fatally.  */
1905         if (datarefs->length ()
1906             > (unsigned)param_loop_max_datarefs_for_datadeps)
1907           return opt_result::failure_at (stmt, "exceeded param "
1908                                          "loop-max-datarefs-for-datadeps\n");
1909       }
1910   return opt_result::success ();
1911 }
1912
1913 /* Look for SLP-only access groups and turn each individual access into its own
1914    group.  */
1915 static void
1916 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1917 {
1918   unsigned int i;
1919   struct data_reference *dr;
1920
1921   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1922
1923   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1924   FOR_EACH_VEC_ELT (datarefs, i, dr)
1925     {
1926       gcc_assert (DR_REF (dr));
1927       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1928
1929       /* Check if the load is a part of an interleaving chain.  */
1930       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1931         {
1932           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1933           unsigned int group_size = DR_GROUP_SIZE (first_element);
1934
1935           /* Check if SLP-only groups.  */
1936           if (!STMT_SLP_TYPE (stmt_info)
1937               && STMT_VINFO_SLP_VECT_ONLY (first_element))
1938             {
1939               /* Dissolve the group.  */
1940               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1941
1942               stmt_vec_info vinfo = first_element;
1943               while (vinfo)
1944                 {
1945                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1946                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1947                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1948                   DR_GROUP_SIZE (vinfo) = 1;
1949                   if (STMT_VINFO_STRIDED_P (first_element))
1950                     DR_GROUP_GAP (vinfo) = 0;
1951                   else
1952                     DR_GROUP_GAP (vinfo) = group_size - 1;
1953                   vinfo = next;
1954                 }
1955             }
1956         }
1957     }
1958 }
1959
1960
1961 /* Decides whether we need to create an epilogue loop to handle
1962    remaining scalar iterations and sets PEELING_FOR_NITERS accordingly.  */
1963
1964 void
1965 determine_peel_for_niter (loop_vec_info loop_vinfo)
1966 {
1967   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1968
1969   unsigned HOST_WIDE_INT const_vf;
1970   HOST_WIDE_INT max_niter
1971     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1972
1973   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1974   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1975     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1976                                           (loop_vinfo));
1977
1978   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1979     /* The main loop handles all iterations.  */
1980     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1981   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1982            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1983     {
1984       /* Work out the (constant) number of iterations that need to be
1985          peeled for reasons other than niters.  */
1986       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1987       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1988         peel_niter += 1;
1989       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1990                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1991         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1992     }
1993   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1994            /* ??? When peeling for gaps but not alignment, we could
1995               try to check whether the (variable) niters is known to be
1996               VF * N + 1.  That's something of a niche case though.  */
1997            || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1998            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1999            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2000                 < (unsigned) exact_log2 (const_vf))
2001                /* In case of versioning, check if the maximum number of
2002                   iterations is greater than th.  If they are identical,
2003                   the epilogue is unnecessary.  */
2004                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2005                    || ((unsigned HOST_WIDE_INT) max_niter
2006                        > (th / const_vf) * const_vf))))
2007     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2008 }
2009
2010
2011 /* Function vect_analyze_loop_2.
2012
2013    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2014    for it.  The different analyses will record information in the
2015    loop_vec_info struct.  */
2016 static opt_result
2017 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2018 {
2019   opt_result ok = opt_result::success ();
2020   int res;
2021   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2022   poly_uint64 min_vf = 2;
2023   loop_vec_info orig_loop_vinfo = NULL;
2024
2025   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2026      loop_vec_info of the first vectorized loop.  */
2027   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2028     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2029   else
2030     orig_loop_vinfo = loop_vinfo;
2031   gcc_assert (orig_loop_vinfo);
2032
2033   /* The first group of checks is independent of the vector size.  */
2034   fatal = true;
2035
2036   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2037       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2038     return opt_result::failure_at (vect_location,
2039                                    "not vectorized: simd if(0)\n");
2040
2041   /* Find all data references in the loop (which correspond to vdefs/vuses)
2042      and analyze their evolution in the loop.  */
2043
2044   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2045
2046   /* Gather the data references and count stmts in the loop.  */
2047   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2048     {
2049       opt_result res
2050         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2051                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2052                                      n_stmts);
2053       if (!res)
2054         {
2055           if (dump_enabled_p ())
2056             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2057                              "not vectorized: loop contains function "
2058                              "calls or data references that cannot "
2059                              "be analyzed\n");
2060           return res;
2061         }
2062       loop_vinfo->shared->save_datarefs ();
2063     }
2064   else
2065     loop_vinfo->shared->check_datarefs ();
2066
2067   /* Analyze the data references and also adjust the minimal
2068      vectorization factor according to the loads and stores.  */
2069
2070   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2071   if (!ok)
2072     {
2073       if (dump_enabled_p ())
2074         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2075                          "bad data references.\n");
2076       return ok;
2077     }
2078
2079   /* Classify all cross-iteration scalar data-flow cycles.
2080      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2081   vect_analyze_scalar_cycles (loop_vinfo);
2082
2083   vect_pattern_recog (loop_vinfo);
2084
2085   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2086
2087   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2088      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2089
2090   ok = vect_analyze_data_ref_accesses (loop_vinfo);
2091   if (!ok)
2092     {
2093       if (dump_enabled_p ())
2094         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2095                          "bad data access.\n");
2096       return ok;
2097     }
2098
2099   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2100
2101   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2102   if (!ok)
2103     {
2104       if (dump_enabled_p ())
2105         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2106                          "unexpected pattern.\n");
2107       return ok;
2108     }
2109
2110   /* While the rest of the analysis below depends on it in some way.  */
2111   fatal = false;
2112
2113   /* Analyze data dependences between the data-refs in the loop
2114      and adjust the maximum vectorization factor according to
2115      the dependences.
2116      FORNOW: fail at the first data dependence that we encounter.  */
2117
2118   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2119   if (!ok)
2120     {
2121       if (dump_enabled_p ())
2122         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2123                          "bad data dependence.\n");
2124       return ok;
2125     }
2126   if (max_vf != MAX_VECTORIZATION_FACTOR
2127       && maybe_lt (max_vf, min_vf))
2128     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2129   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2130
2131   ok = vect_determine_vectorization_factor (loop_vinfo);
2132   if (!ok)
2133     {
2134       if (dump_enabled_p ())
2135         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2136                          "can't determine vectorization factor.\n");
2137       return ok;
2138     }
2139   if (max_vf != MAX_VECTORIZATION_FACTOR
2140       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2141     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2142
2143   /* Compute the scalar iteration cost.  */
2144   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2145
2146   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2147
2148   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2149   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2150   if (!ok)
2151     return ok;
2152
2153   /* If there are any SLP instances mark them as pure_slp.  */
2154   bool slp = vect_make_slp_decision (loop_vinfo);
2155   if (slp)
2156     {
2157       /* Find stmts that need to be both vectorized and SLPed.  */
2158       vect_detect_hybrid_slp (loop_vinfo);
2159
2160       /* Update the vectorization factor based on the SLP decision.  */
2161       vect_update_vf_for_slp (loop_vinfo);
2162
2163       /* Optimize the SLP graph with the vectorization factor fixed.  */
2164       vect_optimize_slp (loop_vinfo);
2165     }
2166
2167   bool saved_can_use_partial_vectors_p
2168     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2169
2170   /* We don't expect to have to roll back to anything other than an empty
2171      set of rgroups.  */
2172   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2173
2174   /* This is the point where we can re-start analysis with SLP forced off.  */
2175 start_over:
2176
2177   /* Now the vectorization factor is final.  */
2178   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2179   gcc_assert (known_ne (vectorization_factor, 0U));
2180
2181   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2182     {
2183       dump_printf_loc (MSG_NOTE, vect_location,
2184                        "vectorization_factor = ");
2185       dump_dec (MSG_NOTE, vectorization_factor);
2186       dump_printf (MSG_NOTE, ", niters = %wd\n",
2187                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2188     }
2189
2190   /* Analyze the alignment of the data-refs in the loop.
2191      Fail if a data reference is found that cannot be vectorized.  */
2192
2193   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2194   if (!ok)
2195     {
2196       if (dump_enabled_p ())
2197         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2198                          "bad data alignment.\n");
2199       return ok;
2200     }
2201
2202   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2203      It is important to call pruning after vect_analyze_data_ref_accesses,
2204      since we use grouping information gathered by interleaving analysis.  */
2205   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2206   if (!ok)
2207     return ok;
2208
2209   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2210      vectorization, since we do not want to add extra peeling or
2211      add versioning for alignment.  */
2212   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2213     /* This pass will decide on using loop versioning and/or loop peeling in
2214        order to enhance the alignment of data references in the loop.  */
2215     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2216   if (!ok)
2217     return ok;
2218
2219   if (slp)
2220     {
2221       /* Analyze operations in the SLP instances.  Note this may
2222          remove unsupported SLP instances which makes the above
2223          SLP kind detection invalid.  */
2224       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2225       vect_slp_analyze_operations (loop_vinfo);
2226       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2227         {
2228           ok = opt_result::failure_at (vect_location,
2229                                        "unsupported SLP instances\n");
2230           goto again;
2231         }
2232     }
2233
2234   /* Dissolve SLP-only groups.  */
2235   vect_dissolve_slp_only_groups (loop_vinfo);
2236
2237   /* Scan all the remaining operations in the loop that are not subject
2238      to SLP and make sure they are vectorizable.  */
2239   ok = vect_analyze_loop_operations (loop_vinfo);
2240   if (!ok)
2241     {
2242       if (dump_enabled_p ())
2243         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2244                          "bad operation or unsupported loop bound.\n");
2245       return ok;
2246     }
2247
2248   /* For now, we don't expect to mix both masking and length approaches for one
2249      loop, disable it if both are recorded.  */
2250   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2251       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2252       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2253     {
2254       if (dump_enabled_p ())
2255         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256                          "can't vectorize a loop with partial vectors"
2257                          " because we don't expect to mix different"
2258                          " approaches with partial vectors for the"
2259                          " same loop.\n");
2260       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2261     }
2262
2263   /* Decide whether to vectorize a loop with partial vectors for
2264      this vectorization factor.  */
2265   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2266     {
2267       if (param_vect_partial_vector_usage == 0)
2268         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2269       else if (vect_verify_full_masking (loop_vinfo)
2270                || vect_verify_loop_lens (loop_vinfo))
2271         {
2272           /* The epilogue and other known niters less than VF
2273             cases can still use vector access with length fully.  */
2274           if (param_vect_partial_vector_usage == 1
2275               && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2276               && !vect_known_niters_smaller_than_vf (loop_vinfo))
2277             {
2278               LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2279               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2280             }
2281           else
2282             LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2283         }
2284       else
2285         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2286     }
2287   else
2288     LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2289
2290   if (dump_enabled_p ())
2291     {
2292       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2293         dump_printf_loc (MSG_NOTE, vect_location,
2294                          "operating on partial vectors.\n");
2295       else
2296         dump_printf_loc (MSG_NOTE, vect_location,
2297                          "operating only on full vectors.\n");
2298     }
2299
2300   /* If epilog loop is required because of data accesses with gaps,
2301      one additional iteration needs to be peeled.  Check if there is
2302      enough iterations for vectorization.  */
2303   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2304       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2305       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2306     {
2307       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2308       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2309
2310       if (known_lt (wi::to_widest (scalar_niters), vf))
2311         return opt_result::failure_at (vect_location,
2312                                        "loop has no enough iterations to"
2313                                        " support peeling for gaps.\n");
2314     }
2315
2316   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2317      to be able to handle fewer than VF scalars, or needs to have a lower VF
2318      than the main loop.  */
2319   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2320       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2321       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2322                    LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2323     return opt_result::failure_at (vect_location,
2324                                    "Vectorization factor too high for"
2325                                    " epilogue loop.\n");
2326
2327   /* Check the costings of the loop make vectorizing worthwhile.  */
2328   res = vect_analyze_loop_costing (loop_vinfo);
2329   if (res < 0)
2330     {
2331       ok = opt_result::failure_at (vect_location,
2332                                    "Loop costings may not be worthwhile.\n");
2333       goto again;
2334     }
2335   if (!res)
2336     return opt_result::failure_at (vect_location,
2337                                    "Loop costings not worthwhile.\n");
2338
2339   determine_peel_for_niter (loop_vinfo);
2340   /* If an epilogue loop is required make sure we can create one.  */
2341   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2342       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2343     {
2344       if (dump_enabled_p ())
2345         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2346       if (!vect_can_advance_ivs_p (loop_vinfo)
2347           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2348                                            single_exit (LOOP_VINFO_LOOP
2349                                                          (loop_vinfo))))
2350         {
2351           ok = opt_result::failure_at (vect_location,
2352                                        "not vectorized: can't create required "
2353                                        "epilog loop\n");
2354           goto again;
2355         }
2356     }
2357
2358   /* During peeling, we need to check if number of loop iterations is
2359      enough for both peeled prolog loop and vector loop.  This check
2360      can be merged along with threshold check of loop versioning, so
2361      increase threshold for this case if necessary.
2362
2363      If we are analyzing an epilogue we still want to check what its
2364      versioning threshold would be.  If we decide to vectorize the epilogues we
2365      will want to use the lowest versioning threshold of all epilogues and main
2366      loop.  This will enable us to enter a vectorized epilogue even when
2367      versioning the loop.  We can't simply check whether the epilogue requires
2368      versioning though since we may have skipped some versioning checks when
2369      analyzing the epilogue.  For instance, checks for alias versioning will be
2370      skipped when dealing with epilogues as we assume we already checked them
2371      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
2372   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2373     {
2374       poly_uint64 niters_th = 0;
2375       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2376
2377       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2378         {
2379           /* Niters for peeled prolog loop.  */
2380           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2381             {
2382               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2383               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2384               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2385             }
2386           else
2387             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2388         }
2389
2390       /* Niters for at least one iteration of vectorized loop.  */
2391       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2392         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2393       /* One additional iteration because of peeling for gap.  */
2394       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2395         niters_th += 1;
2396
2397       /*  Use the same condition as vect_transform_loop to decide when to use
2398           the cost to determine a versioning threshold.  */
2399       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2400           && ordered_p (th, niters_th))
2401         niters_th = ordered_max (poly_uint64 (th), niters_th);
2402
2403       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2404     }
2405
2406   gcc_assert (known_eq (vectorization_factor,
2407                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2408
2409   /* Ok to vectorize!  */
2410   return opt_result::success ();
2411
2412 again:
2413   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2414   gcc_assert (!ok);
2415
2416   /* Try again with SLP forced off but if we didn't do any SLP there is
2417      no point in re-trying.  */
2418   if (!slp)
2419     return ok;
2420
2421   /* If there are reduction chains re-trying will fail anyway.  */
2422   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2423     return ok;
2424
2425   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2426      via interleaving or lane instructions.  */
2427   slp_instance instance;
2428   slp_tree node;
2429   unsigned i, j;
2430   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2431     {
2432       stmt_vec_info vinfo;
2433       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2434       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2435         continue;
2436       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2437       unsigned int size = DR_GROUP_SIZE (vinfo);
2438       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2439       if (! vect_store_lanes_supported (vectype, size, false)
2440          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2441          && ! vect_grouped_store_supported (vectype, size))
2442         return opt_result::failure_at (vinfo->stmt,
2443                                        "unsupported grouped store\n");
2444       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2445         {
2446           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2447           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2448           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2449           size = DR_GROUP_SIZE (vinfo);
2450           vectype = STMT_VINFO_VECTYPE (vinfo);
2451           if (! vect_load_lanes_supported (vectype, size, false)
2452               && ! vect_grouped_load_supported (vectype, single_element_p,
2453                                                 size))
2454             return opt_result::failure_at (vinfo->stmt,
2455                                            "unsupported grouped load\n");
2456         }
2457     }
2458
2459   if (dump_enabled_p ())
2460     dump_printf_loc (MSG_NOTE, vect_location,
2461                      "re-trying with SLP disabled\n");
2462
2463   /* Roll back state appropriately.  No SLP this time.  */
2464   slp = false;
2465   /* Restore vectorization factor as it were without SLP.  */
2466   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2467   /* Free the SLP instances.  */
2468   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2469     vect_free_slp_instance (instance, false);
2470   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2471   /* Reset SLP type to loop_vect on all stmts.  */
2472   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2473     {
2474       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2475       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2476            !gsi_end_p (si); gsi_next (&si))
2477         {
2478           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2479           STMT_SLP_TYPE (stmt_info) = loop_vect;
2480           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2481               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2482             {
2483               /* vectorizable_reduction adjusts reduction stmt def-types,
2484                  restore them to that of the PHI.  */
2485               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2486                 = STMT_VINFO_DEF_TYPE (stmt_info);
2487               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2488                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
2489                 = STMT_VINFO_DEF_TYPE (stmt_info);
2490             }
2491         }
2492       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2493            !gsi_end_p (si); gsi_next (&si))
2494         {
2495           if (is_gimple_debug (gsi_stmt (si)))
2496             continue;
2497           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2498           STMT_SLP_TYPE (stmt_info) = loop_vect;
2499           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2500             {
2501               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2502               stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2503               STMT_SLP_TYPE (stmt_info) = loop_vect;
2504               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2505                    !gsi_end_p (pi); gsi_next (&pi))
2506                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2507                   = loop_vect;
2508             }
2509         }
2510     }
2511   /* Free optimized alias test DDRS.  */
2512   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2513   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2514   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2515   /* Reset target cost data.  */
2516   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2517   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2518     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2519   /* Reset accumulated rgroup information.  */
2520   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2521   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2522   /* Reset assorted flags.  */
2523   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2524   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2525   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2526   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2527   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2528     = saved_can_use_partial_vectors_p;
2529
2530   goto start_over;
2531 }
2532
2533 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2534    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2535    OLD_LOOP_VINFO is better unless something specifically indicates
2536    otherwise.
2537
2538    Note that this deliberately isn't a partial order.  */
2539
2540 static bool
2541 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2542                           loop_vec_info old_loop_vinfo)
2543 {
2544   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2545   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2546
2547   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2548   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2549
2550   /* Always prefer a VF of loop->simdlen over any other VF.  */
2551   if (loop->simdlen)
2552     {
2553       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2554       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2555       if (new_simdlen_p != old_simdlen_p)
2556         return new_simdlen_p;
2557     }
2558
2559   /* Limit the VFs to what is likely to be the maximum number of iterations,
2560      to handle cases in which at least one loop_vinfo is fully-masked.  */
2561   HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2562   if (estimated_max_niter != -1)
2563     {
2564       if (known_le (estimated_max_niter, new_vf))
2565         new_vf = estimated_max_niter;
2566       if (known_le (estimated_max_niter, old_vf))
2567         old_vf = estimated_max_niter;
2568     }
2569
2570   /* Check whether the (fractional) cost per scalar iteration is lower
2571      or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf.  */
2572   poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2573                              * poly_widest_int (old_vf));
2574   poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2575                              * poly_widest_int (new_vf));
2576   if (maybe_lt (rel_old, rel_new))
2577     {
2578       /* When old_loop_vinfo uses a variable vectorization factor,
2579          we know that it has a lower cost for at least one runtime VF.
2580          However, we don't know how likely that VF is.
2581
2582          One option would be to compare the costs for the estimated VFs.
2583          The problem is that that can put too much pressure on the cost
2584          model.  E.g. if the estimated VF is also the lowest possible VF,
2585          and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2586          for the estimated VF, we'd then choose new_loop_vinfo even
2587          though (a) new_loop_vinfo might not actually be better than
2588          old_loop_vinfo for that VF and (b) it would be significantly
2589          worse at larger VFs.
2590
2591          Here we go for a hacky compromise: pick new_loop_vinfo if it is
2592          no more expensive than old_loop_vinfo even after doubling the
2593          estimated old_loop_vinfo VF.  For all but trivial loops, this
2594          ensures that we only pick new_loop_vinfo if it is significantly
2595          better than old_loop_vinfo at the estimated VF.  */
2596       if (rel_new.is_constant ())
2597         return false;
2598
2599       HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2600       HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2601       widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2602                                       * widest_int (old_estimated_vf));
2603       widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2604                                       * widest_int (new_estimated_vf));
2605       return estimated_rel_new * 2 <= estimated_rel_old;
2606     }
2607   if (known_lt (rel_new, rel_old))
2608     return true;
2609
2610   /* If there's nothing to choose between the loop bodies, see whether
2611      there's a difference in the prologue and epilogue costs.  */
2612   if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2613     return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2614
2615   return false;
2616 }
2617
2618 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2619    true if we should.  */
2620
2621 static bool
2622 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2623                         loop_vec_info old_loop_vinfo)
2624 {
2625   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2626     return false;
2627
2628   if (dump_enabled_p ())
2629     dump_printf_loc (MSG_NOTE, vect_location,
2630                      "***** Preferring vector mode %s to vector mode %s\n",
2631                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
2632                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
2633   return true;
2634 }
2635
2636 /* Function vect_analyze_loop.
2637
2638    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2639    for it.  The different analyses will record information in the
2640    loop_vec_info struct.  */
2641 opt_loop_vec_info
2642 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2643 {
2644   auto_vector_modes vector_modes;
2645
2646   /* Autodetect first vector size we try.  */
2647   unsigned int autovec_flags
2648     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2649                                                     loop->simdlen != 0);
2650   unsigned int mode_i = 0;
2651
2652   DUMP_VECT_SCOPE ("analyze_loop_nest");
2653
2654   if (loop_outer (loop)
2655       && loop_vec_info_for_loop (loop_outer (loop))
2656       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2657     return opt_loop_vec_info::failure_at (vect_location,
2658                                           "outer-loop already vectorized.\n");
2659
2660   if (!find_loop_nest (loop, &shared->loop_nest))
2661     return opt_loop_vec_info::failure_at
2662       (vect_location,
2663        "not vectorized: loop nest containing two or more consecutive inner"
2664        " loops cannot be vectorized\n");
2665
2666   unsigned n_stmts = 0;
2667   machine_mode autodetected_vector_mode = VOIDmode;
2668   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2669   machine_mode next_vector_mode = VOIDmode;
2670   poly_uint64 lowest_th = 0;
2671   unsigned vectorized_loops = 0;
2672   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2673                              && !unlimited_cost_model (loop));
2674
2675   bool vect_epilogues = false;
2676   opt_result res = opt_result::success ();
2677   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2678   while (1)
2679     {
2680       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2681       opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2682       if (!loop_vinfo)
2683         {
2684           if (dump_enabled_p ())
2685             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2686                              "bad loop form.\n");
2687           gcc_checking_assert (first_loop_vinfo == NULL);
2688           return loop_vinfo;
2689         }
2690       loop_vinfo->vector_mode = next_vector_mode;
2691
2692       bool fatal = false;
2693
2694       /* When pick_lowest_cost_p is true, we should in principle iterate
2695          over all the loop_vec_infos that LOOP_VINFO could replace and
2696          try to vectorize LOOP_VINFO under the same conditions.
2697          E.g. when trying to replace an epilogue loop, we should vectorize
2698          LOOP_VINFO as an epilogue loop with the same VF limit.  When trying
2699          to replace the main loop, we should vectorize LOOP_VINFO as a main
2700          loop too.
2701
2702          However, autovectorize_vector_modes is usually sorted as follows:
2703
2704          - Modes that naturally produce lower VFs usually follow modes that
2705            naturally produce higher VFs.
2706
2707          - When modes naturally produce the same VF, maskable modes
2708            usually follow unmaskable ones, so that the maskable mode
2709            can be used to vectorize the epilogue of the unmaskable mode.
2710
2711          This order is preferred because it leads to the maximum
2712          epilogue vectorization opportunities.  Targets should only use
2713          a different order if they want to make wide modes available while
2714          disparaging them relative to earlier, smaller modes.  The assumption
2715          in that case is that the wider modes are more expensive in some
2716          way that isn't reflected directly in the costs.
2717
2718          There should therefore be few interesting cases in which
2719          LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2720          treated as a standalone loop, and ends up being genuinely cheaper
2721          than FIRST_LOOP_VINFO.  */
2722       if (vect_epilogues)
2723         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2724
2725       res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2726       if (mode_i == 0)
2727         autodetected_vector_mode = loop_vinfo->vector_mode;
2728       if (dump_enabled_p ())
2729         {
2730           if (res)
2731             dump_printf_loc (MSG_NOTE, vect_location,
2732                              "***** Analysis succeeded with vector mode %s\n",
2733                              GET_MODE_NAME (loop_vinfo->vector_mode));
2734           else
2735             dump_printf_loc (MSG_NOTE, vect_location,
2736                              "***** Analysis failed with vector mode %s\n",
2737                              GET_MODE_NAME (loop_vinfo->vector_mode));
2738         }
2739
2740       loop->aux = NULL;
2741
2742       if (!fatal)
2743         while (mode_i < vector_modes.length ()
2744                && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2745           {
2746             if (dump_enabled_p ())
2747               dump_printf_loc (MSG_NOTE, vect_location,
2748                                "***** The result for vector mode %s would"
2749                                " be the same\n",
2750                                GET_MODE_NAME (vector_modes[mode_i]));
2751             mode_i += 1;
2752           }
2753
2754       if (res)
2755         {
2756           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2757           vectorized_loops++;
2758
2759           /* Once we hit the desired simdlen for the first time,
2760              discard any previous attempts.  */
2761           if (simdlen
2762               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2763             {
2764               delete first_loop_vinfo;
2765               first_loop_vinfo = opt_loop_vec_info::success (NULL);
2766               LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2767               simdlen = 0;
2768             }
2769           else if (pick_lowest_cost_p && first_loop_vinfo)
2770             {
2771               /* Keep trying to roll back vectorization attempts while the
2772                  loop_vec_infos they produced were worse than this one.  */
2773               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2774               while (!vinfos.is_empty ()
2775                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2776                 {
2777                   gcc_assert (vect_epilogues);
2778                   delete vinfos.pop ();
2779                 }
2780               if (vinfos.is_empty ()
2781                   && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2782                 {
2783                   delete first_loop_vinfo;
2784                   first_loop_vinfo = opt_loop_vec_info::success (NULL);
2785                   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2786                 }
2787             }
2788
2789           if (first_loop_vinfo == NULL)
2790             {
2791               first_loop_vinfo = loop_vinfo;
2792               lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2793             }
2794           else if (vect_epilogues
2795                    /* For now only allow one epilogue loop.  */
2796                    && first_loop_vinfo->epilogue_vinfos.is_empty ())
2797             {
2798               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2799               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2800               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2801                           || maybe_ne (lowest_th, 0U));
2802               /* Keep track of the known smallest versioning
2803                  threshold.  */
2804               if (ordered_p (lowest_th, th))
2805                 lowest_th = ordered_min (lowest_th, th);
2806             }
2807           else
2808             {
2809               delete loop_vinfo;
2810               loop_vinfo = opt_loop_vec_info::success (NULL);
2811             }
2812
2813           /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2814              enabled, SIMDUID is not set, it is the innermost loop and we have
2815              either already found the loop's SIMDLEN or there was no SIMDLEN to
2816              begin with.
2817              TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
2818           vect_epilogues = (!simdlen
2819                             && loop->inner == NULL
2820                             && param_vect_epilogues_nomask
2821                             && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2822                             && !loop->simduid
2823                             /* For now only allow one epilogue loop, but allow
2824                                pick_lowest_cost_p to replace it.  */
2825                             && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2826                                 || pick_lowest_cost_p));
2827
2828           /* Commit to first_loop_vinfo if we have no reason to try
2829              alternatives.  */
2830           if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2831             break;
2832         }
2833       else
2834         {
2835           delete loop_vinfo;
2836           loop_vinfo = opt_loop_vec_info::success (NULL);
2837           if (fatal)
2838             {
2839               gcc_checking_assert (first_loop_vinfo == NULL);
2840               break;
2841             }
2842         }
2843
2844       /* Handle the case that the original loop can use partial
2845          vectorization, but want to only adopt it for the epilogue.
2846          The retry should be in the same mode as original.  */
2847       if (vect_epilogues
2848           && loop_vinfo
2849           && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
2850         {
2851           gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2852                       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
2853           if (dump_enabled_p ())
2854             dump_printf_loc (MSG_NOTE, vect_location,
2855                              "***** Re-trying analysis with same vector mode"
2856                              " %s for epilogue with partial vectors.\n",
2857                              GET_MODE_NAME (loop_vinfo->vector_mode));
2858           continue;
2859         }
2860
2861       if (mode_i < vector_modes.length ()
2862           && VECTOR_MODE_P (autodetected_vector_mode)
2863           && (related_vector_mode (vector_modes[mode_i],
2864                                    GET_MODE_INNER (autodetected_vector_mode))
2865               == autodetected_vector_mode)
2866           && (related_vector_mode (autodetected_vector_mode,
2867                                    GET_MODE_INNER (vector_modes[mode_i]))
2868               == vector_modes[mode_i]))
2869         {
2870           if (dump_enabled_p ())
2871             dump_printf_loc (MSG_NOTE, vect_location,
2872                              "***** Skipping vector mode %s, which would"
2873                              " repeat the analysis for %s\n",
2874                              GET_MODE_NAME (vector_modes[mode_i]),
2875                              GET_MODE_NAME (autodetected_vector_mode));
2876           mode_i += 1;
2877         }
2878
2879       if (mode_i == vector_modes.length ()
2880           || autodetected_vector_mode == VOIDmode)
2881         break;
2882
2883       /* Try the next biggest vector size.  */
2884       next_vector_mode = vector_modes[mode_i++];
2885       if (dump_enabled_p ())
2886         dump_printf_loc (MSG_NOTE, vect_location,
2887                          "***** Re-trying analysis with vector mode %s\n",
2888                          GET_MODE_NAME (next_vector_mode));
2889     }
2890
2891   if (first_loop_vinfo)
2892     {
2893       loop->aux = (loop_vec_info) first_loop_vinfo;
2894       if (dump_enabled_p ())
2895         dump_printf_loc (MSG_NOTE, vect_location,
2896                          "***** Choosing vector mode %s\n",
2897                          GET_MODE_NAME (first_loop_vinfo->vector_mode));
2898       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2899       return first_loop_vinfo;
2900     }
2901
2902   return opt_loop_vec_info::propagate_failure (res);
2903 }
2904
2905 /* Return true if there is an in-order reduction function for CODE, storing
2906    it in *REDUC_FN if so.  */
2907
2908 static bool
2909 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2910 {
2911   switch (code)
2912     {
2913     case PLUS_EXPR:
2914       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2915       return true;
2916
2917     default:
2918       return false;
2919     }
2920 }
2921
2922 /* Function reduction_fn_for_scalar_code
2923
2924    Input:
2925    CODE - tree_code of a reduction operations.
2926
2927    Output:
2928    REDUC_FN - the corresponding internal function to be used to reduce the
2929       vector of partial results into a single scalar result, or IFN_LAST
2930       if the operation is a supported reduction operation, but does not have
2931       such an internal function.
2932
2933    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2934
2935 static bool
2936 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2937 {
2938   switch (code)
2939     {
2940       case MAX_EXPR:
2941         *reduc_fn = IFN_REDUC_MAX;
2942         return true;
2943
2944       case MIN_EXPR:
2945         *reduc_fn = IFN_REDUC_MIN;
2946         return true;
2947
2948       case PLUS_EXPR:
2949         *reduc_fn = IFN_REDUC_PLUS;
2950         return true;
2951
2952       case BIT_AND_EXPR:
2953         *reduc_fn = IFN_REDUC_AND;
2954         return true;
2955
2956       case BIT_IOR_EXPR:
2957         *reduc_fn = IFN_REDUC_IOR;
2958         return true;
2959
2960       case BIT_XOR_EXPR:
2961         *reduc_fn = IFN_REDUC_XOR;
2962         return true;
2963
2964       case MULT_EXPR:
2965       case MINUS_EXPR:
2966         *reduc_fn = IFN_LAST;
2967         return true;
2968
2969       default:
2970        return false;
2971     }
2972 }
2973
2974 /* If there is a neutral value X such that SLP reduction NODE would not
2975    be affected by the introduction of additional X elements, return that X,
2976    otherwise return null.  CODE is the code of the reduction and VECTOR_TYPE
2977    is the vector type that would hold element X.  REDUC_CHAIN is true if
2978    the SLP statements perform a single reduction, false if each statement
2979    performs an independent reduction.  */
2980
2981 static tree
2982 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2983                               tree_code code, bool reduc_chain)
2984 {
2985   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2986   stmt_vec_info stmt_vinfo = stmts[0];
2987   tree scalar_type = TREE_TYPE (vector_type);
2988   class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2989   gcc_assert (loop);
2990
2991   switch (code)
2992     {
2993     case WIDEN_SUM_EXPR:
2994     case DOT_PROD_EXPR:
2995     case SAD_EXPR:
2996     case PLUS_EXPR:
2997     case MINUS_EXPR:
2998     case BIT_IOR_EXPR:
2999     case BIT_XOR_EXPR:
3000       return build_zero_cst (scalar_type);
3001
3002     case MULT_EXPR:
3003       return build_one_cst (scalar_type);
3004
3005     case BIT_AND_EXPR:
3006       return build_all_ones_cst (scalar_type);
3007
3008     case MAX_EXPR:
3009     case MIN_EXPR:
3010       /* For MIN/MAX the initial values are neutral.  A reduction chain
3011          has only a single initial value, so that value is neutral for
3012          all statements.  */
3013       if (reduc_chain)
3014         return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3015                                       loop_preheader_edge (loop));
3016       return NULL_TREE;
3017
3018     default:
3019       return NULL_TREE;
3020     }
3021 }
3022
3023 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3024    STMT is printed with a message MSG. */
3025
3026 static void
3027 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3028 {
3029   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3030 }
3031
3032 /* Return true if we need an in-order reduction for operation CODE
3033    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3034    overflow must wrap.  */
3035
3036 bool
3037 needs_fold_left_reduction_p (tree type, tree_code code)
3038 {
3039   /* CHECKME: check for !flag_finite_math_only too?  */
3040   if (SCALAR_FLOAT_TYPE_P (type))
3041     switch (code)
3042       {
3043       case MIN_EXPR:
3044       case MAX_EXPR:
3045         return false;
3046
3047       default:
3048         return !flag_associative_math;
3049       }
3050
3051   if (INTEGRAL_TYPE_P (type))
3052     {
3053       if (!operation_no_trapping_overflow (type, code))
3054         return true;
3055       return false;
3056     }
3057
3058   if (SAT_FIXED_POINT_TYPE_P (type))
3059     return true;
3060
3061   return false;
3062 }
3063
3064 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3065    has a handled computation expression.  Store the main reduction
3066    operation in *CODE.  */
3067
3068 static bool
3069 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3070                       tree loop_arg, enum tree_code *code,
3071                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3072 {
3073   auto_bitmap visited;
3074   tree lookfor = PHI_RESULT (phi);
3075   ssa_op_iter curri;
3076   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3077   while (USE_FROM_PTR (curr) != loop_arg)
3078     curr = op_iter_next_use (&curri);
3079   curri.i = curri.numops;
3080   do
3081     {
3082       path.safe_push (std::make_pair (curri, curr));
3083       tree use = USE_FROM_PTR (curr);
3084       if (use == lookfor)
3085         break;
3086       gimple *def = SSA_NAME_DEF_STMT (use);
3087       if (gimple_nop_p (def)
3088           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3089         {
3090 pop:
3091           do
3092             {
3093               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3094               curri = x.first;
3095               curr = x.second;
3096               do
3097                 curr = op_iter_next_use (&curri);
3098               /* Skip already visited or non-SSA operands (from iterating
3099                  over PHI args).  */
3100               while (curr != NULL_USE_OPERAND_P
3101                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3102                          || ! bitmap_set_bit (visited,
3103                                               SSA_NAME_VERSION
3104                                                 (USE_FROM_PTR (curr)))));
3105             }
3106           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3107           if (curr == NULL_USE_OPERAND_P)
3108             break;
3109         }
3110       else
3111         {
3112           if (gimple_code (def) == GIMPLE_PHI)
3113             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3114           else
3115             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3116           while (curr != NULL_USE_OPERAND_P
3117                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3118                      || ! bitmap_set_bit (visited,
3119                                           SSA_NAME_VERSION
3120                                             (USE_FROM_PTR (curr)))))
3121             curr = op_iter_next_use (&curri);
3122           if (curr == NULL_USE_OPERAND_P)
3123             goto pop;
3124         }
3125     }
3126   while (1);
3127   if (dump_file && (dump_flags & TDF_DETAILS))
3128     {
3129       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3130       unsigned i;
3131       std::pair<ssa_op_iter, use_operand_p> *x;
3132       FOR_EACH_VEC_ELT (path, i, x)
3133         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3134       dump_printf (MSG_NOTE, "\n");
3135     }
3136
3137   /* Check whether the reduction path detected is valid.  */
3138   bool fail = path.length () == 0;
3139   bool neg = false;
3140   int sign = -1;
3141   *code = ERROR_MARK;
3142   for (unsigned i = 1; i < path.length (); ++i)
3143     {
3144       gimple *use_stmt = USE_STMT (path[i].second);
3145       tree op = USE_FROM_PTR (path[i].second);
3146       if (! is_gimple_assign (use_stmt)
3147           /* The following make sure we can compute the operand index
3148              easily plus it mostly disallows chaining via COND_EXPR condition
3149              operands.  */
3150           || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3151               && (gimple_num_ops (use_stmt) <= 2
3152                   || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3153               && (gimple_num_ops (use_stmt) <= 3
3154                   || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3155         {
3156           fail = true;
3157           break;
3158         }
3159       /* Check there's only a single stmt the op is used on inside
3160          of the loop.  */
3161       imm_use_iterator imm_iter;
3162       gimple *op_use_stmt;
3163       unsigned cnt = 0;
3164       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3165         if (!is_gimple_debug (op_use_stmt)
3166             && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
3167           {
3168             /* We want to allow x + x but not x < 1 ? x : 2.  */
3169             if (is_gimple_assign (op_use_stmt)
3170                 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3171               {
3172                 use_operand_p use_p;
3173                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3174                   cnt++;
3175               }
3176             else
3177               cnt++;
3178           }
3179       if (cnt != 1)
3180         {
3181           fail = true;
3182           break;
3183         }
3184       tree_code use_code = gimple_assign_rhs_code (use_stmt);
3185       if (use_code == MINUS_EXPR)
3186         {
3187           use_code = PLUS_EXPR;
3188           /* Track whether we negate the reduction value each iteration.  */
3189           if (gimple_assign_rhs2 (use_stmt) == op)
3190             neg = ! neg;
3191         }
3192       if (CONVERT_EXPR_CODE_P (use_code)
3193           && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3194                                     TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3195         ;
3196       else if (*code == ERROR_MARK)
3197         {
3198           *code = use_code;
3199           sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3200         }
3201       else if (use_code != *code)
3202         {
3203           fail = true;
3204           break;
3205         }
3206       else if ((use_code == MIN_EXPR
3207                 || use_code == MAX_EXPR)
3208                && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3209         {
3210           fail = true;
3211           break;
3212         }
3213     }
3214   return ! fail && ! neg && *code != ERROR_MARK;
3215 }
3216
3217 bool
3218 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3219                       tree loop_arg, enum tree_code code)
3220 {
3221   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3222   enum tree_code code_;
3223   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3224           && code_ == code);
3225 }
3226
3227
3228
3229 /* Function vect_is_simple_reduction
3230
3231    (1) Detect a cross-iteration def-use cycle that represents a simple
3232    reduction computation.  We look for the following pattern:
3233
3234    loop_header:
3235      a1 = phi < a0, a2 >
3236      a3 = ...
3237      a2 = operation (a3, a1)
3238
3239    or
3240
3241    a3 = ...
3242    loop_header:
3243      a1 = phi < a0, a2 >
3244      a2 = operation (a3, a1)
3245
3246    such that:
3247    1. operation is commutative and associative and it is safe to
3248       change the order of the computation
3249    2. no uses for a2 in the loop (a2 is used out of the loop)
3250    3. no uses of a1 in the loop besides the reduction operation
3251    4. no uses of a1 outside the loop.
3252
3253    Conditions 1,4 are tested here.
3254    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3255
3256    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3257    nested cycles.
3258
3259    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3260    reductions:
3261
3262      a1 = phi < a0, a2 >
3263      inner loop (def of a3)
3264      a2 = phi < a3 >
3265
3266    (4) Detect condition expressions, ie:
3267      for (int i = 0; i < N; i++)
3268        if (a[i] < val)
3269         ret_val = a[i];
3270
3271 */
3272
3273 static stmt_vec_info
3274 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3275                           bool *double_reduc, bool *reduc_chain_p)
3276 {
3277   gphi *phi = as_a <gphi *> (phi_info->stmt);
3278   gimple *phi_use_stmt = NULL;
3279   imm_use_iterator imm_iter;
3280   use_operand_p use_p;
3281
3282   *double_reduc = false;
3283   *reduc_chain_p = false;
3284   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3285
3286   tree phi_name = PHI_RESULT (phi);
3287   /* ???  If there are no uses of the PHI result the inner loop reduction
3288      won't be detected as possibly double-reduction by vectorizable_reduction
3289      because that tries to walk the PHI arg from the preheader edge which
3290      can be constant.  See PR60382.  */
3291   if (has_zero_uses (phi_name))
3292     return NULL;
3293   class loop *loop = (gimple_bb (phi))->loop_father;
3294   unsigned nphi_def_loop_uses = 0;
3295   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3296     {
3297       gimple *use_stmt = USE_STMT (use_p);
3298       if (is_gimple_debug (use_stmt))
3299         continue;
3300
3301       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3302         {
3303           if (dump_enabled_p ())
3304             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3305                              "intermediate value used outside loop.\n");
3306
3307           return NULL;
3308         }
3309
3310       nphi_def_loop_uses++;
3311       phi_use_stmt = use_stmt;
3312     }
3313
3314   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3315   if (TREE_CODE (latch_def) != SSA_NAME)
3316     {
3317       if (dump_enabled_p ())
3318         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3319                          "reduction: not ssa_name: %T\n", latch_def);
3320       return NULL;
3321     }
3322
3323   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3324   if (!def_stmt_info
3325       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3326     return NULL;
3327
3328   bool nested_in_vect_loop
3329     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3330   unsigned nlatch_def_loop_uses = 0;
3331   auto_vec<gphi *, 3> lcphis;
3332   bool inner_loop_of_double_reduc = false;
3333   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3334     {
3335       gimple *use_stmt = USE_STMT (use_p);
3336       if (is_gimple_debug (use_stmt))
3337         continue;
3338       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3339         nlatch_def_loop_uses++;
3340       else
3341         {
3342           /* We can have more than one loop-closed PHI.  */
3343           lcphis.safe_push (as_a <gphi *> (use_stmt));
3344           if (nested_in_vect_loop
3345               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3346                   == vect_double_reduction_def))
3347             inner_loop_of_double_reduc = true;
3348         }
3349     }
3350
3351   /* If we are vectorizing an inner reduction we are executing that
3352      in the original order only in case we are not dealing with a
3353      double reduction.  */
3354   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3355     {
3356       if (dump_enabled_p ())
3357         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3358                         "detected nested cycle: ");
3359       return def_stmt_info;
3360     }
3361
3362   /* If this isn't a nested cycle or if the nested cycle reduction value
3363      is used ouside of the inner loop we cannot handle uses of the reduction
3364      value.  */
3365   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3366     {
3367       if (dump_enabled_p ())
3368         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3369                          "reduction used in loop.\n");
3370       return NULL;
3371     }
3372
3373   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3374      defined in the inner loop.  */
3375   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3376     {
3377       tree op1 = PHI_ARG_DEF (def_stmt, 0);
3378       if (gimple_phi_num_args (def_stmt) != 1
3379           || TREE_CODE (op1) != SSA_NAME)
3380         {
3381           if (dump_enabled_p ())
3382             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3383                              "unsupported phi node definition.\n");
3384
3385           return NULL;
3386         }
3387
3388       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3389       if (gimple_bb (def1)
3390           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3391           && loop->inner
3392           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3393           && is_gimple_assign (def1)
3394           && is_a <gphi *> (phi_use_stmt)
3395           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3396         {
3397           if (dump_enabled_p ())
3398             report_vect_op (MSG_NOTE, def_stmt,
3399                             "detected double reduction: ");
3400
3401           *double_reduc = true;
3402           return def_stmt_info;
3403         }
3404
3405       return NULL;
3406     }
3407
3408   /* Look for the expression computing latch_def from then loop PHI result.  */
3409   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3410   enum tree_code code;
3411   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3412                             path))
3413     {
3414       STMT_VINFO_REDUC_CODE (phi_info) = code;
3415       if (code == COND_EXPR && !nested_in_vect_loop)
3416         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3417
3418       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3419          reduction chain for which the additional restriction is that
3420          all operations in the chain are the same.  */
3421       auto_vec<stmt_vec_info, 8> reduc_chain;
3422       unsigned i;
3423       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3424       for (i = path.length () - 1; i >= 1; --i)
3425         {
3426           gimple *stmt = USE_STMT (path[i].second);
3427           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3428           STMT_VINFO_REDUC_IDX (stmt_info)
3429             = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3430           enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3431           bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3432                                      && (i == 1 || i == path.length () - 1));
3433           if ((stmt_code != code && !leading_conversion)
3434               /* We can only handle the final value in epilogue
3435                  generation for reduction chains.  */
3436               || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3437             is_slp_reduc = false;
3438           /* For reduction chains we support a trailing/leading
3439              conversions.  We do not store those in the actual chain.  */
3440           if (leading_conversion)
3441             continue;
3442           reduc_chain.safe_push (stmt_info);
3443         }
3444       if (is_slp_reduc && reduc_chain.length () > 1)
3445         {
3446           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3447             {
3448               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3449               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3450             }
3451           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3452           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3453
3454           /* Save the chain for further analysis in SLP detection.  */
3455           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3456           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3457
3458           *reduc_chain_p = true;
3459           if (dump_enabled_p ())
3460             dump_printf_loc (MSG_NOTE, vect_location,
3461                             "reduction: detected reduction chain\n");
3462         }
3463       else if (dump_enabled_p ())
3464         dump_printf_loc (MSG_NOTE, vect_location,
3465                          "reduction: detected reduction\n");
3466
3467       return def_stmt_info;
3468     }
3469
3470   if (dump_enabled_p ())
3471     dump_printf_loc (MSG_NOTE, vect_location,
3472                      "reduction: unknown pattern\n");
3473
3474   return NULL;
3475 }
3476
3477 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3478    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3479    or -1 if not known.  */
3480
3481 static int
3482 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3483 {
3484   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3485   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3486     {
3487       if (dump_enabled_p ())
3488         dump_printf_loc (MSG_NOTE, vect_location,
3489                          "cost model: epilogue peel iters set to vf/2 "
3490                          "because loop iterations are unknown .\n");
3491       return assumed_vf / 2;
3492     }
3493   else
3494     {
3495       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3496       peel_iters_prologue = MIN (niters, peel_iters_prologue);
3497       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3498       /* If we need to peel for gaps, but no peeling is required, we have to
3499          peel VF iterations.  */
3500       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3501         peel_iters_epilogue = assumed_vf;
3502       return peel_iters_epilogue;
3503     }
3504 }
3505
3506 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3507 int
3508 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3509                              int *peel_iters_epilogue,
3510                              stmt_vector_for_cost *scalar_cost_vec,
3511                              stmt_vector_for_cost *prologue_cost_vec,
3512                              stmt_vector_for_cost *epilogue_cost_vec)
3513 {
3514   int retval = 0;
3515
3516   *peel_iters_epilogue
3517     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3518
3519   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3520     {
3521       /* If peeled iterations are known but number of scalar loop
3522          iterations are unknown, count a taken branch per peeled loop.  */
3523       if (peel_iters_prologue > 0)
3524         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3525                                    NULL, NULL_TREE, 0, vect_prologue);
3526       if (*peel_iters_epilogue > 0)
3527         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3528                                     NULL, NULL_TREE, 0, vect_epilogue);
3529     }
3530
3531   stmt_info_for_cost *si;
3532   int j;
3533   if (peel_iters_prologue)
3534     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3535       retval += record_stmt_cost (prologue_cost_vec,
3536                                   si->count * peel_iters_prologue,
3537                                   si->kind, si->stmt_info, si->misalign,
3538                                   vect_prologue);
3539   if (*peel_iters_epilogue)
3540     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3541       retval += record_stmt_cost (epilogue_cost_vec,
3542                                   si->count * *peel_iters_epilogue,
3543                                   si->kind, si->stmt_info, si->misalign,
3544                                   vect_epilogue);
3545
3546   return retval;
3547 }
3548
3549 /* Function vect_estimate_min_profitable_iters
3550
3551    Return the number of iterations required for the vector version of the
3552    loop to be profitable relative to the cost of the scalar version of the
3553    loop.
3554
3555    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3556    of iterations for vectorization.  -1 value means loop vectorization
3557    is not profitable.  This returned value may be used for dynamic
3558    profitability check.
3559
3560    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3561    for static check against estimated number of iterations.  */
3562
3563 static void
3564 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3565                                     int *ret_min_profitable_niters,
3566                                     int *ret_min_profitable_estimate)
3567 {
3568   int min_profitable_iters;
3569   int min_profitable_estimate;
3570   int peel_iters_prologue;
3571   int peel_iters_epilogue;
3572   unsigned vec_inside_cost = 0;
3573   int vec_outside_cost = 0;
3574   unsigned vec_prologue_cost = 0;
3575   unsigned vec_epilogue_cost = 0;
3576   int scalar_single_iter_cost = 0;
3577   int scalar_outside_cost = 0;
3578   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3579   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3580   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3581
3582   /* Cost model disabled.  */
3583   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3584     {
3585       if (dump_enabled_p ())
3586         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3587       *ret_min_profitable_niters = 0;
3588       *ret_min_profitable_estimate = 0;
3589       return;
3590     }
3591
3592   /* Requires loop versioning tests to handle misalignment.  */
3593   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3594     {
3595       /*  FIXME: Make cost depend on complexity of individual check.  */
3596       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3597       (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3598                             NULL, NULL_TREE, 0, vect_prologue);
3599       if (dump_enabled_p ())
3600         dump_printf (MSG_NOTE,
3601                      "cost model: Adding cost of checks for loop "
3602                      "versioning to treat misalignment.\n");
3603     }
3604
3605   /* Requires loop versioning with alias checks.  */
3606   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3607     {
3608       /*  FIXME: Make cost depend on complexity of individual check.  */
3609       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3610       (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3611                             NULL, NULL_TREE, 0, vect_prologue);
3612       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3613       if (len)
3614         /* Count LEN - 1 ANDs and LEN comparisons.  */
3615         (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3616                               scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3617       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3618       if (len)
3619         {
3620           /* Count LEN - 1 ANDs and LEN comparisons.  */
3621           unsigned int nstmts = len * 2 - 1;
3622           /* +1 for each bias that needs adding.  */
3623           for (unsigned int i = 0; i < len; ++i)
3624             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3625               nstmts += 1;
3626           (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3627                                 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3628         }
3629       if (dump_enabled_p ())
3630         dump_printf (MSG_NOTE,
3631                      "cost model: Adding cost of checks for loop "
3632                      "versioning aliasing.\n");
3633     }
3634
3635   /* Requires loop versioning with niter checks.  */
3636   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3637     {
3638       /*  FIXME: Make cost depend on complexity of individual check.  */
3639       (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3640                             NULL, NULL_TREE, 0, vect_prologue);
3641       if (dump_enabled_p ())
3642         dump_printf (MSG_NOTE,
3643                      "cost model: Adding cost of checks for loop "
3644                      "versioning niters.\n");
3645     }
3646
3647   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3648     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3649                           NULL, NULL_TREE, 0, vect_prologue);
3650
3651   /* Count statements in scalar loop.  Using this as scalar cost for a single
3652      iteration for now.
3653
3654      TODO: Add outer loop support.
3655
3656      TODO: Consider assigning different costs to different scalar
3657      statements.  */
3658
3659   scalar_single_iter_cost
3660     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3661
3662   /* Add additional cost for the peeled instructions in prologue and epilogue
3663      loop.  (For fully-masked loops there will be no peeling.)
3664
3665      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3666      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3667
3668      TODO: Build an expression that represents peel_iters for prologue and
3669      epilogue to be used in a run-time test.  */
3670
3671   bool prologue_need_br_taken_cost = false;
3672   bool prologue_need_br_not_taken_cost = false;
3673
3674   /* Calculate peel_iters_prologue.  */
3675   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3676     peel_iters_prologue = 0;
3677   else if (npeel < 0)
3678     {
3679       peel_iters_prologue = assumed_vf / 2;
3680       if (dump_enabled_p ())
3681         dump_printf (MSG_NOTE, "cost model: "
3682                      "prologue peel iters set to vf/2.\n");
3683
3684       /* If peeled iterations are unknown, count a taken branch and a not taken
3685          branch per peeled loop.  Even if scalar loop iterations are known,
3686          vector iterations are not known since peeled prologue iterations are
3687          not known.  Hence guards remain the same.  */
3688       prologue_need_br_taken_cost = true;
3689       prologue_need_br_not_taken_cost = true;
3690     }
3691   else
3692     {
3693       peel_iters_prologue = npeel;
3694       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3695         /* If peeled iterations are known but number of scalar loop
3696            iterations are unknown, count a taken branch per peeled loop.  */
3697         prologue_need_br_taken_cost = true;
3698     }
3699
3700   bool epilogue_need_br_taken_cost = false;
3701   bool epilogue_need_br_not_taken_cost = false;
3702
3703   /* Calculate peel_iters_epilogue.  */
3704   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3705     /* We need to peel exactly one iteration for gaps.  */
3706     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3707   else if (npeel < 0)
3708     {
3709       /* If peeling for alignment is unknown, loop bound of main loop
3710          becomes unknown.  */
3711       peel_iters_epilogue = assumed_vf / 2;
3712       if (dump_enabled_p ())
3713         dump_printf (MSG_NOTE, "cost model: "
3714                      "epilogue peel iters set to vf/2 because "
3715                      "peeling for alignment is unknown.\n");
3716
3717       /* See the same reason above in peel_iters_prologue calculation.  */
3718       epilogue_need_br_taken_cost = true;
3719       epilogue_need_br_not_taken_cost = true;
3720     }
3721   else
3722     {
3723       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3724       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3725         /* If peeled iterations are known but number of scalar loop
3726            iterations are unknown, count a taken branch per peeled loop.  */
3727         epilogue_need_br_taken_cost = true;
3728     }
3729
3730   stmt_info_for_cost *si;
3731   int j;
3732   /* Add costs associated with peel_iters_prologue.  */
3733   if (peel_iters_prologue)
3734     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3735       {
3736         (void) add_stmt_cost (loop_vinfo, target_cost_data,
3737                               si->count * peel_iters_prologue, si->kind,
3738                               si->stmt_info, si->vectype, si->misalign,
3739                               vect_prologue);
3740       }
3741
3742   /* Add costs associated with peel_iters_epilogue.  */
3743   if (peel_iters_epilogue)
3744     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3745       {
3746         (void) add_stmt_cost (loop_vinfo, target_cost_data,
3747                               si->count * peel_iters_epilogue, si->kind,
3748                               si->stmt_info, si->vectype, si->misalign,
3749                               vect_epilogue);
3750       }
3751
3752   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
3753
3754   if (prologue_need_br_taken_cost)
3755     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3756                           NULL, NULL_TREE, 0, vect_prologue);
3757
3758   if (prologue_need_br_not_taken_cost)
3759     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3760                           cond_branch_not_taken, NULL, NULL_TREE, 0,
3761                           vect_prologue);
3762
3763   if (epilogue_need_br_taken_cost)
3764     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3765                           NULL, NULL_TREE, 0, vect_epilogue);
3766
3767   if (epilogue_need_br_not_taken_cost)
3768     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3769                           cond_branch_not_taken, NULL, NULL_TREE, 0,
3770                           vect_epilogue);
3771
3772   /* Take care of special costs for rgroup controls of partial vectors.  */
3773   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3774     {
3775       /* Calculate how many masks we need to generate.  */
3776       unsigned int num_masks = 0;
3777       rgroup_controls *rgm;
3778       unsigned int num_vectors_m1;
3779       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3780         if (rgm->type)
3781           num_masks += num_vectors_m1 + 1;
3782       gcc_assert (num_masks > 0);
3783
3784       /* In the worst case, we need to generate each mask in the prologue
3785          and in the loop body.  One of the loop body mask instructions
3786          replaces the comparison in the scalar loop, and since we don't
3787          count the scalar comparison against the scalar body, we shouldn't
3788          count that vector instruction against the vector body either.
3789
3790          Sometimes we can use unpacks instead of generating prologue
3791          masks and sometimes the prologue mask will fold to a constant,
3792          so the actual prologue cost might be smaller.  However, it's
3793          simpler and safer to use the worst-case cost; if this ends up
3794          being the tie-breaker between vectorizing or not, then it's
3795          probably better not to vectorize.  */
3796       (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
3797                             vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
3798       (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
3799                             vector_stmt, NULL, NULL_TREE, 0, vect_body);
3800     }
3801   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
3802     {
3803       /* Referring to the functions vect_set_loop_condition_partial_vectors
3804          and vect_set_loop_controls_directly, we need to generate each
3805          length in the prologue and in the loop body if required. Although
3806          there are some possible optimizations, we consider the worst case
3807          here.  */
3808
3809       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
3810       bool need_iterate_p
3811         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3812            && !vect_known_niters_smaller_than_vf (loop_vinfo));
3813
3814       /* Calculate how many statements to be added.  */
3815       unsigned int prologue_stmts = 0;
3816       unsigned int body_stmts = 0;
3817
3818       rgroup_controls *rgc;
3819       unsigned int num_vectors_m1;
3820       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
3821         if (rgc->type)
3822           {
3823             /* May need one SHIFT for nitems_total computation.  */
3824             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
3825             if (nitems != 1 && !niters_known_p)
3826               prologue_stmts += 1;
3827
3828             /* May need one MAX and one MINUS for wrap around.  */
3829             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
3830               prologue_stmts += 2;
3831
3832             /* Need one MAX and one MINUS for each batch limit excepting for
3833                the 1st one.  */
3834             prologue_stmts += num_vectors_m1 * 2;
3835
3836             unsigned int num_vectors = num_vectors_m1 + 1;
3837
3838             /* Need to set up lengths in prologue, only one MIN required
3839                for each since start index is zero.  */
3840             prologue_stmts += num_vectors;
3841
3842             /* Each may need two MINs and one MINUS to update lengths in body
3843                for next iteration.  */
3844             if (need_iterate_p)
3845               body_stmts += 3 * num_vectors;
3846           }
3847
3848       (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
3849                             scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3850       (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
3851                             scalar_stmt, NULL, NULL_TREE, 0, vect_body);
3852     }
3853
3854   /* FORNOW: The scalar outside cost is incremented in one of the
3855      following ways:
3856
3857      1. The vectorizer checks for alignment and aliasing and generates
3858      a condition that allows dynamic vectorization.  A cost model
3859      check is ANDED with the versioning condition.  Hence scalar code
3860      path now has the added cost of the versioning check.
3861
3862        if (cost > th & versioning_check)
3863          jmp to vector code
3864
3865      Hence run-time scalar is incremented by not-taken branch cost.
3866
3867      2. The vectorizer then checks if a prologue is required.  If the
3868      cost model check was not done before during versioning, it has to
3869      be done before the prologue check.
3870
3871        if (cost <= th)
3872          prologue = scalar_iters
3873        if (prologue == 0)
3874          jmp to vector code
3875        else
3876          execute prologue
3877        if (prologue == num_iters)
3878          go to exit
3879
3880      Hence the run-time scalar cost is incremented by a taken branch,
3881      plus a not-taken branch, plus a taken branch cost.
3882
3883      3. The vectorizer then checks if an epilogue is required.  If the
3884      cost model check was not done before during prologue check, it
3885      has to be done with the epilogue check.
3886
3887        if (prologue == 0)
3888          jmp to vector code
3889        else
3890          execute prologue
3891        if (prologue == num_iters)
3892          go to exit
3893        vector code:
3894          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3895            jmp to epilogue
3896
3897      Hence the run-time scalar cost should be incremented by 2 taken
3898      branches.
3899
3900      TODO: The back end may reorder the BBS's differently and reverse
3901      conditions/branch directions.  Change the estimates below to
3902      something more reasonable.  */
3903
3904   /* If the number of iterations is known and we do not do versioning, we can
3905      decide whether to vectorize at compile time.  Hence the scalar version
3906      do not carry cost model guard costs.  */
3907   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3908       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3909     {
3910       /* Cost model check occurs at versioning.  */
3911       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3912         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3913       else
3914         {
3915           /* Cost model check occurs at prologue generation.  */
3916           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3917             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3918               + vect_get_stmt_cost (cond_branch_not_taken);
3919           /* Cost model check occurs at epilogue generation.  */
3920           else
3921             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3922         }
3923     }
3924
3925   /* Complete the target-specific cost calculations.  */
3926   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3927                &vec_inside_cost, &vec_epilogue_cost);
3928
3929   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3930
3931   /* Stash the costs so that we can compare two loop_vec_infos.  */
3932   loop_vinfo->vec_inside_cost = vec_inside_cost;
3933   loop_vinfo->vec_outside_cost = vec_outside_cost;
3934
3935   if (dump_enabled_p ())
3936     {
3937       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3938       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3939                    vec_inside_cost);
3940       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3941                    vec_prologue_cost);
3942       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3943                    vec_epilogue_cost);
3944       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3945                    scalar_single_iter_cost);
3946       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3947                    scalar_outside_cost);
3948       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3949                    vec_outside_cost);
3950       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3951                    peel_iters_prologue);
3952       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3953                    peel_iters_epilogue);
3954     }
3955
3956   /* Calculate number of iterations required to make the vector version
3957      profitable, relative to the loop bodies only.  The following condition
3958      must hold true:
3959      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3960      where
3961      SIC = scalar iteration cost, VIC = vector iteration cost,
3962      VOC = vector outside cost, VF = vectorization factor,
3963      NPEEL = prologue iterations + epilogue iterations,
3964      SOC = scalar outside cost for run time cost model check.  */
3965
3966   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3967                           - vec_inside_cost);
3968   if (saving_per_viter <= 0)
3969     {
3970       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3971         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3972                     "vectorization did not happen for a simd loop");
3973
3974       if (dump_enabled_p ())
3975         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3976                          "cost model: the vector iteration cost = %d "
3977                          "divided by the scalar iteration cost = %d "
3978                          "is greater or equal to the vectorization factor = %d"
3979                          ".\n",
3980                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3981       *ret_min_profitable_niters = -1;
3982       *ret_min_profitable_estimate = -1;
3983       return;
3984     }
3985
3986   /* ??? The "if" arm is written to handle all cases; see below for what
3987      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
3988   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3989     {
3990       /* Rewriting the condition above in terms of the number of
3991          vector iterations (vniters) rather than the number of
3992          scalar iterations (niters) gives:
3993
3994          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3995
3996          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3997
3998          For integer N, X and Y when X > 0:
3999
4000          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
4001       int outside_overhead = (vec_outside_cost
4002                               - scalar_single_iter_cost * peel_iters_prologue
4003                               - scalar_single_iter_cost * peel_iters_epilogue
4004                               - scalar_outside_cost);
4005       /* We're only interested in cases that require at least one
4006          vector iteration.  */
4007       int min_vec_niters = 1;
4008       if (outside_overhead > 0)
4009         min_vec_niters = outside_overhead / saving_per_viter + 1;
4010
4011       if (dump_enabled_p ())
4012         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4013                      min_vec_niters);
4014
4015       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4016         {
4017           /* Now that we know the minimum number of vector iterations,
4018              find the minimum niters for which the scalar cost is larger:
4019
4020              SIC * niters > VIC * vniters + VOC - SOC
4021
4022              We know that the minimum niters is no more than
4023              vniters * VF + NPEEL, but it might be (and often is) less
4024              than that if a partial vector iteration is cheaper than the
4025              equivalent scalar code.  */
4026           int threshold = (vec_inside_cost * min_vec_niters
4027                            + vec_outside_cost
4028                            - scalar_outside_cost);
4029           if (threshold <= 0)
4030             min_profitable_iters = 1;
4031           else
4032             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4033         }
4034       else
4035         /* Convert the number of vector iterations into a number of
4036            scalar iterations.  */
4037         min_profitable_iters = (min_vec_niters * assumed_vf
4038                                 + peel_iters_prologue
4039                                 + peel_iters_epilogue);
4040     }
4041   else
4042     {
4043       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4044                               * assumed_vf
4045                               - vec_inside_cost * peel_iters_prologue
4046                               - vec_inside_cost * peel_iters_epilogue);
4047       if (min_profitable_iters <= 0)
4048         min_profitable_iters = 0;
4049       else
4050         {
4051           min_profitable_iters /= saving_per_viter;
4052
4053           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4054               <= (((int) vec_inside_cost * min_profitable_iters)
4055                   + (((int) vec_outside_cost - scalar_outside_cost)
4056                      * assumed_vf)))
4057             min_profitable_iters++;
4058         }
4059     }
4060
4061   if (dump_enabled_p ())
4062     dump_printf (MSG_NOTE,
4063                  "  Calculated minimum iters for profitability: %d\n",
4064                  min_profitable_iters);
4065
4066   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4067       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4068     /* We want the vectorized loop to execute at least once.  */
4069     min_profitable_iters = assumed_vf + peel_iters_prologue;
4070   else if (min_profitable_iters < peel_iters_prologue)
4071     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4072        vectorized loop executes at least once.  */
4073     min_profitable_iters = peel_iters_prologue;
4074
4075   if (dump_enabled_p ())
4076     dump_printf_loc (MSG_NOTE, vect_location,
4077                      "  Runtime profitability threshold = %d\n",
4078                      min_profitable_iters);
4079
4080   *ret_min_profitable_niters = min_profitable_iters;
4081
4082   /* Calculate number of iterations required to make the vector version
4083      profitable, relative to the loop bodies only.
4084
4085      Non-vectorized variant is SIC * niters and it must win over vector
4086      variant on the expected loop trip count.  The following condition must hold true:
4087      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
4088
4089   if (vec_outside_cost <= 0)
4090     min_profitable_estimate = 0;
4091   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4092     {
4093       /* This is a repeat of the code above, but with + SOC rather
4094          than - SOC.  */
4095       int outside_overhead = (vec_outside_cost
4096                               - scalar_single_iter_cost * peel_iters_prologue
4097                               - scalar_single_iter_cost * peel_iters_epilogue
4098                               + scalar_outside_cost);
4099       int min_vec_niters = 1;
4100       if (outside_overhead > 0)
4101         min_vec_niters = outside_overhead / saving_per_viter + 1;
4102
4103       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4104         {
4105           int threshold = (vec_inside_cost * min_vec_niters
4106                            + vec_outside_cost
4107                            + scalar_outside_cost);
4108           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4109         }
4110       else
4111         min_profitable_estimate = (min_vec_niters * assumed_vf
4112                                    + peel_iters_prologue
4113                                    + peel_iters_epilogue);
4114     }
4115   else
4116     {
4117       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4118                                  * assumed_vf
4119                                  - vec_inside_cost * peel_iters_prologue
4120                                  - vec_inside_cost * peel_iters_epilogue)
4121                                  / ((scalar_single_iter_cost * assumed_vf)
4122                                    - vec_inside_cost);
4123     }
4124   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4125   if (dump_enabled_p ())
4126     dump_printf_loc (MSG_NOTE, vect_location,
4127                      "  Static estimate profitability threshold = %d\n",
4128                      min_profitable_estimate);
4129
4130   *ret_min_profitable_estimate = min_profitable_estimate;
4131 }
4132
4133 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4134    vector elements (not bits) for a vector with NELT elements.  */
4135 static void
4136 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4137                               vec_perm_builder *sel)
4138 {
4139   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4140      by vec_perm_indices.  */
4141   sel->new_vector (nelt, 1, 3);
4142   for (unsigned int i = 0; i < 3; i++)
4143     sel->quick_push (i + offset);
4144 }
4145
4146 /* Checks whether the target supports whole-vector shifts for vectors of mode
4147    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4148    it supports vec_perm_const with masks for all necessary shift amounts.  */
4149 static bool
4150 have_whole_vector_shift (machine_mode mode)
4151 {
4152   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4153     return true;
4154
4155   /* Variable-length vectors should be handled via the optab.  */
4156   unsigned int nelt;
4157   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4158     return false;
4159
4160   vec_perm_builder sel;
4161   vec_perm_indices indices;
4162   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4163     {
4164       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4165       indices.new_vector (sel, 2, nelt);
4166       if (!can_vec_perm_const_p (mode, indices, false))
4167         return false;
4168     }
4169   return true;
4170 }
4171
4172 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4173    functions. Design better to avoid maintenance issues.  */
4174
4175 /* Function vect_model_reduction_cost.
4176
4177    Models cost for a reduction operation, including the vector ops
4178    generated within the strip-mine loop, the initial definition before
4179    the loop, and the epilogue code that must be generated.  */
4180
4181 static void
4182 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4183                            stmt_vec_info stmt_info, internal_fn reduc_fn,
4184                            vect_reduction_type reduction_type,
4185                            int ncopies, stmt_vector_for_cost *cost_vec)
4186 {
4187   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4188   enum tree_code code;
4189   optab optab;
4190   tree vectype;
4191   machine_mode mode;
4192   class loop *loop = NULL;
4193
4194   if (loop_vinfo)
4195     loop = LOOP_VINFO_LOOP (loop_vinfo);
4196
4197   /* Condition reductions generate two reductions in the loop.  */
4198   if (reduction_type == COND_REDUCTION)
4199     ncopies *= 2;
4200
4201   vectype = STMT_VINFO_VECTYPE (stmt_info);
4202   mode = TYPE_MODE (vectype);
4203   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4204
4205   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4206
4207   if (reduction_type == EXTRACT_LAST_REDUCTION)
4208     /* No extra instructions are needed in the prologue.  The loop body
4209        operations are costed in vectorizable_condition.  */
4210     inside_cost = 0;
4211   else if (reduction_type == FOLD_LEFT_REDUCTION)
4212     {
4213       /* No extra instructions needed in the prologue.  */
4214       prologue_cost = 0;
4215
4216       if (reduc_fn != IFN_LAST)
4217         /* Count one reduction-like operation per vector.  */
4218         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4219                                         stmt_info, 0, vect_body);
4220       else
4221         {
4222           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4223           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4224           inside_cost = record_stmt_cost (cost_vec, nelements,
4225                                           vec_to_scalar, stmt_info, 0,
4226                                           vect_body);
4227           inside_cost += record_stmt_cost (cost_vec, nelements,
4228                                            scalar_stmt, stmt_info, 0,
4229                                            vect_body);
4230         }
4231     }
4232   else
4233     {
4234       /* Add in cost for initial definition.
4235          For cond reduction we have four vectors: initial index, step,
4236          initial result of the data reduction, initial value of the index
4237          reduction.  */
4238       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4239       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4240                                          scalar_to_vec, stmt_info, 0,
4241                                          vect_prologue);
4242
4243       /* Cost of reduction op inside loop.  */
4244       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4245                                       stmt_info, 0, vect_body);
4246     }
4247
4248   /* Determine cost of epilogue code.
4249
4250      We have a reduction operator that will reduce the vector in one statement.
4251      Also requires scalar extract.  */
4252
4253   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4254     {
4255       if (reduc_fn != IFN_LAST)
4256         {
4257           if (reduction_type == COND_REDUCTION)
4258             {
4259               /* An EQ stmt and an COND_EXPR stmt.  */
4260               epilogue_cost += record_stmt_cost (cost_vec, 2,
4261                                                  vector_stmt, stmt_info, 0,
4262                                                  vect_epilogue);
4263               /* Reduction of the max index and a reduction of the found
4264                  values.  */
4265               epilogue_cost += record_stmt_cost (cost_vec, 2,
4266                                                  vec_to_scalar, stmt_info, 0,
4267                                                  vect_epilogue);
4268               /* A broadcast of the max value.  */
4269               epilogue_cost += record_stmt_cost (cost_vec, 1,
4270                                                  scalar_to_vec, stmt_info, 0,
4271                                                  vect_epilogue);
4272             }
4273           else
4274             {
4275               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4276                                                  stmt_info, 0, vect_epilogue);
4277               epilogue_cost += record_stmt_cost (cost_vec, 1,
4278                                                  vec_to_scalar, stmt_info, 0,
4279                                                  vect_epilogue);
4280             }
4281         }
4282       else if (reduction_type == COND_REDUCTION)
4283         {
4284           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4285           /* Extraction of scalar elements.  */
4286           epilogue_cost += record_stmt_cost (cost_vec,
4287                                              2 * estimated_nunits,
4288                                              vec_to_scalar, stmt_info, 0,
4289                                              vect_epilogue);
4290           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4291           epilogue_cost += record_stmt_cost (cost_vec,
4292                                              2 * estimated_nunits - 3,
4293                                              scalar_stmt, stmt_info, 0,
4294                                              vect_epilogue);
4295         }
4296       else if (reduction_type == EXTRACT_LAST_REDUCTION
4297                || reduction_type == FOLD_LEFT_REDUCTION)
4298         /* No extra instructions need in the epilogue.  */
4299         ;
4300       else
4301         {
4302           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4303           tree bitsize =
4304             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4305           int element_bitsize = tree_to_uhwi (bitsize);
4306           int nelements = vec_size_in_bits / element_bitsize;
4307
4308           if (code == COND_EXPR)
4309             code = MAX_EXPR;
4310
4311           optab = optab_for_tree_code (code, vectype, optab_default);
4312
4313           /* We have a whole vector shift available.  */
4314           if (optab != unknown_optab
4315               && VECTOR_MODE_P (mode)
4316               && optab_handler (optab, mode) != CODE_FOR_nothing
4317               && have_whole_vector_shift (mode))
4318             {
4319               /* Final reduction via vector shifts and the reduction operator.
4320                  Also requires scalar extract.  */
4321               epilogue_cost += record_stmt_cost (cost_vec,
4322                                                  exact_log2 (nelements) * 2,
4323                                                  vector_stmt, stmt_info, 0,
4324                                                  vect_epilogue);
4325               epilogue_cost += record_stmt_cost (cost_vec, 1,
4326                                                  vec_to_scalar, stmt_info, 0,
4327                                                  vect_epilogue);
4328             }
4329           else
4330             /* Use extracts and reduction op for final reduction.  For N
4331                elements, we have N extracts and N-1 reduction ops.  */
4332             epilogue_cost += record_stmt_cost (cost_vec,
4333                                                nelements + nelements - 1,
4334                                                vector_stmt, stmt_info, 0,
4335                                                vect_epilogue);
4336         }
4337     }
4338
4339   if (dump_enabled_p ())
4340     dump_printf (MSG_NOTE,
4341                  "vect_model_reduction_cost: inside_cost = %d, "
4342                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4343                  prologue_cost, epilogue_cost);
4344 }
4345
4346
4347 /* Function vect_model_induction_cost.
4348
4349    Models cost for induction operations.  */
4350
4351 static void
4352 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4353                            stmt_vector_for_cost *cost_vec)
4354 {
4355   unsigned inside_cost, prologue_cost;
4356
4357   if (PURE_SLP_STMT (stmt_info))
4358     return;
4359
4360   /* loop cost for vec_loop.  */
4361   inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4362                                   stmt_info, 0, vect_body);
4363
4364   /* prologue cost for vec_init and vec_step.  */
4365   prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4366                                     stmt_info, 0, vect_prologue);
4367
4368   if (dump_enabled_p ())
4369     dump_printf_loc (MSG_NOTE, vect_location,
4370                      "vect_model_induction_cost: inside_cost = %d, "
4371                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4372 }
4373
4374
4375
4376 /* Function get_initial_def_for_reduction
4377
4378    Input:
4379    STMT_VINFO - a stmt that performs a reduction operation in the loop.
4380    INIT_VAL - the initial value of the reduction variable
4381
4382    Output:
4383    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4384         of the reduction (used for adjusting the epilog - see below).
4385    Return a vector variable, initialized according to the operation that
4386         STMT_VINFO performs. This vector will be used as the initial value
4387         of the vector of partial results.
4388
4389    Option1 (adjust in epilog): Initialize the vector as follows:
4390      add/bit or/xor:    [0,0,...,0,0]
4391      mult/bit and:      [1,1,...,1,1]
4392      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4393    and when necessary (e.g. add/mult case) let the caller know
4394    that it needs to adjust the result by init_val.
4395
4396    Option2: Initialize the vector as follows:
4397      add/bit or/xor:    [init_val,0,0,...,0]
4398      mult/bit and:      [init_val,1,1,...,1]
4399      min/max/cond_expr: [init_val,init_val,...,init_val]
4400    and no adjustments are needed.
4401
4402    For example, for the following code:
4403
4404    s = init_val;
4405    for (i=0;i<n;i++)
4406      s = s + a[i];
4407
4408    STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4409    For a vector of 4 units, we want to return either [0,0,0,init_val],
4410    or [0,0,0,0] and let the caller know that it needs to adjust
4411    the result at the end by 'init_val'.
4412
4413    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4414    initialization vector is simpler (same element in all entries), if
4415    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4416
4417    A cost model should help decide between these two schemes.  */
4418
4419 static tree
4420 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4421                                stmt_vec_info stmt_vinfo,
4422                                enum tree_code code, tree init_val,
4423                                tree *adjustment_def)
4424 {
4425   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4426   tree scalar_type = TREE_TYPE (init_val);
4427   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4428   tree def_for_init;
4429   tree init_def;
4430   REAL_VALUE_TYPE real_init_val = dconst0;
4431   int int_init_val = 0;
4432   gimple_seq stmts = NULL;
4433
4434   gcc_assert (vectype);
4435
4436   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4437               || SCALAR_FLOAT_TYPE_P (scalar_type));
4438
4439   gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4440               || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4441
4442   /* ADJUSTMENT_DEF is NULL when called from
4443      vect_create_epilog_for_reduction to vectorize double reduction.  */
4444   if (adjustment_def)
4445     *adjustment_def = NULL;
4446
4447   switch (code)
4448     {
4449     case WIDEN_SUM_EXPR:
4450     case DOT_PROD_EXPR:
4451     case SAD_EXPR:
4452     case PLUS_EXPR:
4453     case MINUS_EXPR:
4454     case BIT_IOR_EXPR:
4455     case BIT_XOR_EXPR:
4456     case MULT_EXPR:
4457     case BIT_AND_EXPR:
4458       {
4459         if (code == MULT_EXPR)
4460           {
4461             real_init_val = dconst1;
4462             int_init_val = 1;
4463           }
4464
4465         if (code == BIT_AND_EXPR)
4466           int_init_val = -1;
4467
4468         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4469           def_for_init = build_real (scalar_type, real_init_val);
4470         else
4471           def_for_init = build_int_cst (scalar_type, int_init_val);
4472
4473         if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4474           {
4475             /* Option1: the first element is '0' or '1' as well.  */
4476             if (!operand_equal_p (def_for_init, init_val, 0))
4477               *adjustment_def = init_val;
4478             init_def = gimple_build_vector_from_val (&stmts, vectype,
4479                                                      def_for_init);
4480           }
4481         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4482           {
4483             /* Option2 (variable length): the first element is INIT_VAL.  */
4484             init_def = gimple_build_vector_from_val (&stmts, vectype,
4485                                                      def_for_init);
4486             init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4487                                      vectype, init_def, init_val);
4488           }
4489         else
4490           {
4491             /* Option2: the first element is INIT_VAL.  */
4492             tree_vector_builder elts (vectype, 1, 2);
4493             elts.quick_push (init_val);
4494             elts.quick_push (def_for_init);
4495             init_def = gimple_build_vector (&stmts, &elts);
4496           }
4497       }
4498       break;
4499
4500     case MIN_EXPR:
4501     case MAX_EXPR:
4502     case COND_EXPR:
4503       {
4504         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4505         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4506       }
4507       break;
4508
4509     default:
4510       gcc_unreachable ();
4511     }
4512
4513   if (stmts)
4514     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4515   return init_def;
4516 }
4517
4518 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4519    NUMBER_OF_VECTORS is the number of vector defs to create.
4520    If NEUTRAL_OP is nonnull, introducing extra elements of that
4521    value will not change the result.  */
4522
4523 static void
4524 get_initial_defs_for_reduction (vec_info *vinfo,
4525                                 slp_tree slp_node,
4526                                 vec<tree> *vec_oprnds,
4527                                 unsigned int number_of_vectors,
4528                                 bool reduc_chain, tree neutral_op)
4529 {
4530   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4531   stmt_vec_info stmt_vinfo = stmts[0];
4532   unsigned HOST_WIDE_INT nunits;
4533   unsigned j, number_of_places_left_in_vector;
4534   tree vector_type;
4535   unsigned int group_size = stmts.length ();
4536   unsigned int i;
4537   class loop *loop;
4538
4539   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4540
4541   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4542
4543   loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4544   gcc_assert (loop);
4545   edge pe = loop_preheader_edge (loop);
4546
4547   gcc_assert (!reduc_chain || neutral_op);
4548
4549   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4550      created vectors. It is greater than 1 if unrolling is performed.
4551
4552      For example, we have two scalar operands, s1 and s2 (e.g., group of
4553      strided accesses of size two), while NUNITS is four (i.e., four scalars
4554      of this type can be packed in a vector).  The output vector will contain
4555      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4556      will be 2).
4557
4558      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4559      vectors containing the operands.
4560
4561      For example, NUNITS is four as before, and the group size is 8
4562      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4563      {s5, s6, s7, s8}.  */
4564
4565   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4566     nunits = group_size;
4567
4568   number_of_places_left_in_vector = nunits;
4569   bool constant_p = true;
4570   tree_vector_builder elts (vector_type, nunits, 1);
4571   elts.quick_grow (nunits);
4572   gimple_seq ctor_seq = NULL;
4573   for (j = 0; j < nunits * number_of_vectors; ++j)
4574     {
4575       tree op;
4576       i = j % group_size;
4577       stmt_vinfo = stmts[i];
4578
4579       /* Get the def before the loop.  In reduction chain we have only
4580          one initial value.  Else we have as many as PHIs in the group.  */
4581       if (reduc_chain)
4582         op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4583       else if (((vec_oprnds->length () + 1) * nunits
4584                 - number_of_places_left_in_vector >= group_size)
4585                && neutral_op)
4586         op = neutral_op;
4587       else
4588         op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4589
4590       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4591       number_of_places_left_in_vector--;
4592       elts[nunits - number_of_places_left_in_vector - 1] = op;
4593       if (!CONSTANT_CLASS_P (op))
4594         constant_p = false;
4595
4596       if (number_of_places_left_in_vector == 0)
4597         {
4598           tree init;
4599           if (constant_p && !neutral_op
4600               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4601               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4602             /* Build the vector directly from ELTS.  */
4603             init = gimple_build_vector (&ctor_seq, &elts);
4604           else if (neutral_op)
4605             {
4606               /* Build a vector of the neutral value and shift the
4607                  other elements into place.  */
4608               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4609                                                    neutral_op);
4610               int k = nunits;
4611               while (k > 0 && elts[k - 1] == neutral_op)
4612                 k -= 1;
4613               while (k > 0)
4614                 {
4615                   k -= 1;
4616                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4617                                        vector_type, init, elts[k]);
4618                 }
4619             }
4620           else
4621             {
4622               /* First time round, duplicate ELTS to fill the
4623                  required number of vectors.  */
4624               duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4625                                         number_of_vectors, *vec_oprnds);
4626               break;
4627             }
4628           vec_oprnds->quick_push (init);
4629
4630           number_of_places_left_in_vector = nunits;
4631           elts.new_vector (vector_type, nunits, 1);
4632           elts.quick_grow (nunits);
4633           constant_p = true;
4634         }
4635     }
4636   if (ctor_seq != NULL)
4637     gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4638 }
4639
4640 /* For a statement STMT_INFO taking part in a reduction operation return
4641    the stmt_vec_info the meta information is stored on.  */
4642
4643 stmt_vec_info
4644 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4645 {
4646   stmt_info = vect_orig_stmt (stmt_info);
4647   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4648   if (!is_a <gphi *> (stmt_info->stmt))
4649     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4650   gphi *phi = as_a <gphi *> (stmt_info->stmt);
4651   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4652     {
4653       if (gimple_phi_num_args (phi) == 1)
4654         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4655     }
4656   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4657     {
4658       edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4659       stmt_vec_info info
4660           = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4661       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4662         stmt_info = info;
4663     }
4664   return stmt_info;
4665 }
4666
4667 /* Function vect_create_epilog_for_reduction
4668
4669    Create code at the loop-epilog to finalize the result of a reduction
4670    computation.
4671
4672    STMT_INFO is the scalar reduction stmt that is being vectorized.
4673    SLP_NODE is an SLP node containing a group of reduction statements. The
4674      first one in this group is STMT_INFO.
4675    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4676    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4677      (counting from 0)
4678
4679    This function:
4680    1. Completes the reduction def-use cycles.
4681    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4682       by calling the function specified by REDUC_FN if available, or by
4683       other means (whole-vector shifts or a scalar loop).
4684       The function also creates a new phi node at the loop exit to preserve
4685       loop-closed form, as illustrated below.
4686
4687      The flow at the entry to this function:
4688
4689         loop:
4690           vec_def = phi <vec_init, null>        # REDUCTION_PHI
4691           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4692           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4693         loop_exit:
4694           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4695           use <s_out0>
4696           use <s_out0>
4697
4698      The above is transformed by this function into:
4699
4700         loop:
4701           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4702           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4703           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4704         loop_exit:
4705           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4706           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4707           v_out2 = reduce <v_out1>
4708           s_out3 = extract_field <v_out2, 0>
4709           s_out4 = adjust_result <s_out3>
4710           use <s_out4>
4711           use <s_out4>
4712 */
4713
4714 static void
4715 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4716                                   stmt_vec_info stmt_info,
4717                                   slp_tree slp_node,
4718                                   slp_instance slp_node_instance)
4719 {
4720   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4721   gcc_assert (reduc_info->is_reduc_info);
4722   /* For double reductions we need to get at the inner loop reduction
4723      stmt which has the meta info attached.  Our stmt_info is that of the
4724      loop-closed PHI of the inner loop which we remember as
4725      def for the reduction PHI generation.  */
4726   bool double_reduc = false;
4727   stmt_vec_info rdef_info = stmt_info;
4728   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4729     {
4730       gcc_assert (!slp_node);
4731       double_reduc = true;
4732       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4733                                             (stmt_info->stmt, 0));
4734       stmt_info = vect_stmt_to_vectorize (stmt_info);
4735     }
4736   gphi *reduc_def_stmt
4737     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4738   enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4739   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4740   tree vectype;
4741   machine_mode mode;
4742   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4743   basic_block exit_bb;
4744   tree scalar_dest;
4745   tree scalar_type;
4746   gimple *new_phi = NULL, *phi;
4747   gimple_stmt_iterator exit_gsi;
4748   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4749   gimple *epilog_stmt = NULL;
4750   gimple *exit_phi;
4751   tree bitsize;
4752   tree def;
4753   tree orig_name, scalar_result;
4754   imm_use_iterator imm_iter, phi_imm_iter;
4755   use_operand_p use_p, phi_use_p;
4756   gimple *use_stmt;
4757   bool nested_in_vect_loop = false;
4758   auto_vec<gimple *> new_phis;
4759   int j, i;
4760   auto_vec<tree> scalar_results;
4761   unsigned int group_size = 1, k;
4762   auto_vec<gimple *> phis;
4763   bool slp_reduc = false;
4764   bool direct_slp_reduc;
4765   tree new_phi_result;
4766   tree induction_index = NULL_TREE;
4767
4768   if (slp_node)
4769     group_size = SLP_TREE_LANES (slp_node);
4770
4771   if (nested_in_vect_loop_p (loop, stmt_info))
4772     {
4773       outer_loop = loop;
4774       loop = loop->inner;
4775       nested_in_vect_loop = true;
4776       gcc_assert (!slp_node);
4777     }
4778   gcc_assert (!nested_in_vect_loop || double_reduc);
4779
4780   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4781   gcc_assert (vectype);
4782   mode = TYPE_MODE (vectype);
4783
4784   tree initial_def = NULL;
4785   tree induc_val = NULL_TREE;
4786   tree adjustment_def = NULL;
4787   if (slp_node)
4788     ;
4789   else
4790     {
4791       /* Get at the scalar def before the loop, that defines the initial value
4792          of the reduction variable.  */
4793       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4794                                            loop_preheader_edge (loop));
4795       /* Optimize: for induction condition reduction, if we can't use zero
4796          for induc_val, use initial_def.  */
4797       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4798         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4799       else if (double_reduc)
4800         ;
4801       else if (nested_in_vect_loop)
4802         ;
4803       else
4804         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4805     }
4806
4807   unsigned vec_num;
4808   int ncopies;
4809   if (slp_node)
4810     {
4811       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4812       ncopies = 1;
4813     }
4814   else
4815     {
4816       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4817       vec_num = 1;
4818       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4819     }
4820
4821   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4822      which is updated with the current index of the loop for every match of
4823      the original loop's cond_expr (VEC_STMT).  This results in a vector
4824      containing the last time the condition passed for that vector lane.
4825      The first match will be a 1 to allow 0 to be used for non-matching
4826      indexes.  If there are no matches at all then the vector will be all
4827      zeroes.
4828
4829      PR92772: This algorithm is broken for architectures that support
4830      masked vectors, but do not provide fold_extract_last.  */
4831   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4832     {
4833       auto_vec<std::pair<tree, bool>, 2> ccompares;
4834       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4835       cond_info = vect_stmt_to_vectorize (cond_info);
4836       while (cond_info != reduc_info)
4837         {
4838           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4839             {
4840               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4841               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4842               ccompares.safe_push
4843                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4844                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
4845             }
4846           cond_info
4847             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4848                                                  1 + STMT_VINFO_REDUC_IDX
4849                                                         (cond_info)));
4850           cond_info = vect_stmt_to_vectorize (cond_info);
4851         }
4852       gcc_assert (ccompares.length () != 0);
4853
4854       tree indx_before_incr, indx_after_incr;
4855       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4856       int scalar_precision
4857         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4858       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4859       tree cr_index_vector_type = get_related_vectype_for_scalar_type
4860         (TYPE_MODE (vectype), cr_index_scalar_type,
4861          TYPE_VECTOR_SUBPARTS (vectype));
4862
4863       /* First we create a simple vector induction variable which starts
4864          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4865          vector size (STEP).  */
4866
4867       /* Create a {1,2,3,...} vector.  */
4868       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4869
4870       /* Create a vector of the step value.  */
4871       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4872       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4873
4874       /* Create an induction variable.  */
4875       gimple_stmt_iterator incr_gsi;
4876       bool insert_after;
4877       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4878       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4879                  insert_after, &indx_before_incr, &indx_after_incr);
4880
4881       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4882          filled with zeros (VEC_ZERO).  */
4883
4884       /* Create a vector of 0s.  */
4885       tree zero = build_zero_cst (cr_index_scalar_type);
4886       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4887
4888       /* Create a vector phi node.  */
4889       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4890       new_phi = create_phi_node (new_phi_tree, loop->header);
4891       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4892                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4893
4894       /* Now take the condition from the loops original cond_exprs
4895          and produce a new cond_exprs (INDEX_COND_EXPR) which for
4896          every match uses values from the induction variable
4897          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4898          (NEW_PHI_TREE).
4899          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4900          the new cond_expr (INDEX_COND_EXPR).  */
4901       gimple_seq stmts = NULL;
4902       for (int i = ccompares.length () - 1; i != -1; --i)
4903         {
4904           tree ccompare = ccompares[i].first;
4905           if (ccompares[i].second)
4906             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4907                                          cr_index_vector_type,
4908                                          ccompare,
4909                                          indx_before_incr, new_phi_tree);
4910           else
4911             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4912                                          cr_index_vector_type,
4913                                          ccompare,
4914                                          new_phi_tree, indx_before_incr);
4915         }
4916       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4917
4918       /* Update the phi with the vec cond.  */
4919       induction_index = new_phi_tree;
4920       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4921                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4922     }
4923
4924   /* 2. Create epilog code.
4925         The reduction epilog code operates across the elements of the vector
4926         of partial results computed by the vectorized loop.
4927         The reduction epilog code consists of:
4928
4929         step 1: compute the scalar result in a vector (v_out2)
4930         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4931         step 3: adjust the scalar result (s_out3) if needed.
4932
4933         Step 1 can be accomplished using one the following three schemes:
4934           (scheme 1) using reduc_fn, if available.
4935           (scheme 2) using whole-vector shifts, if available.
4936           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4937                      combined.
4938
4939           The overall epilog code looks like this:
4940
4941           s_out0 = phi <s_loop>         # original EXIT_PHI
4942           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4943           v_out2 = reduce <v_out1>              # step 1
4944           s_out3 = extract_field <v_out2, 0>    # step 2
4945           s_out4 = adjust_result <s_out3>       # step 3
4946
4947           (step 3 is optional, and steps 1 and 2 may be combined).
4948           Lastly, the uses of s_out0 are replaced by s_out4.  */
4949
4950
4951   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4952          v_out1 = phi <VECT_DEF>
4953          Store them in NEW_PHIS.  */
4954   if (double_reduc)
4955     loop = outer_loop;
4956   exit_bb = single_exit (loop)->dest;
4957   new_phis.create (slp_node ? vec_num : ncopies);
4958   for (unsigned i = 0; i < vec_num; i++)
4959     {
4960       if (slp_node)
4961         def = vect_get_slp_vect_def (slp_node, i);
4962       else
4963         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
4964       for (j = 0; j < ncopies; j++)
4965         {
4966           tree new_def = copy_ssa_name (def);
4967           phi = create_phi_node (new_def, exit_bb);
4968           if (j == 0)
4969             new_phis.quick_push (phi);
4970           else
4971             {
4972               def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
4973               new_phis.quick_push (phi);
4974             }
4975
4976           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4977         }
4978     }
4979
4980   exit_gsi = gsi_after_labels (exit_bb);
4981
4982   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4983          (i.e. when reduc_fn is not available) and in the final adjustment
4984          code (if needed).  Also get the original scalar reduction variable as
4985          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4986          represents a reduction pattern), the tree-code and scalar-def are
4987          taken from the original stmt that the pattern-stmt (STMT) replaces.
4988          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4989          are taken from STMT.  */
4990
4991   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4992   if (orig_stmt_info != stmt_info)
4993     {
4994       /* Reduction pattern  */
4995       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4996       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4997     }
4998
4999   scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5000   scalar_type = TREE_TYPE (scalar_dest);
5001   scalar_results.create (group_size);
5002   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5003   bitsize = TYPE_SIZE (scalar_type);
5004
5005   /* SLP reduction without reduction chain, e.g.,
5006      # a1 = phi <a2, a0>
5007      # b1 = phi <b2, b0>
5008      a2 = operation (a1)
5009      b2 = operation (b1)  */
5010   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5011
5012   /* True if we should implement SLP_REDUC using native reduction operations
5013      instead of scalar operations.  */
5014   direct_slp_reduc = (reduc_fn != IFN_LAST
5015                       && slp_reduc
5016                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5017
5018   /* In case of reduction chain, e.g.,
5019      # a1 = phi <a3, a0>
5020      a2 = operation (a1)
5021      a3 = operation (a2),
5022
5023      we may end up with more than one vector result.  Here we reduce them to
5024      one vector.  */
5025   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5026     {
5027       gimple_seq stmts = NULL;
5028       tree first_vect = PHI_RESULT (new_phis[0]);
5029       first_vect = gimple_convert (&stmts, vectype, first_vect);
5030       for (k = 1; k < new_phis.length (); k++)
5031         {
5032           gimple *next_phi = new_phis[k];
5033           tree second_vect = PHI_RESULT (next_phi);
5034           second_vect = gimple_convert (&stmts, vectype, second_vect);
5035           first_vect = gimple_build (&stmts, code, vectype,
5036                                      first_vect, second_vect);
5037         }
5038       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5039
5040       new_phi_result = first_vect;
5041       new_phis.truncate (0);
5042       new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5043     }
5044   /* Likewise if we couldn't use a single defuse cycle.  */
5045   else if (ncopies > 1)
5046     {
5047       gimple_seq stmts = NULL;
5048       tree first_vect = PHI_RESULT (new_phis[0]);
5049       first_vect = gimple_convert (&stmts, vectype, first_vect);
5050       for (int k = 1; k < ncopies; ++k)
5051         {
5052           tree second_vect = PHI_RESULT (new_phis[k]);
5053           second_vect = gimple_convert (&stmts, vectype, second_vect);
5054           first_vect = gimple_build (&stmts, code, vectype,
5055                                      first_vect, second_vect);
5056         }
5057       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5058       new_phi_result = first_vect;
5059       new_phis.truncate (0);
5060       new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5061     }
5062   else
5063     new_phi_result = PHI_RESULT (new_phis[0]);
5064
5065   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5066       && reduc_fn != IFN_LAST)
5067     {
5068       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5069          various data values where the condition matched and another vector
5070          (INDUCTION_INDEX) containing all the indexes of those matches.  We
5071          need to extract the last matching index (which will be the index with
5072          highest value) and use this to index into the data vector.
5073          For the case where there were no matches, the data vector will contain
5074          all default values and the index vector will be all zeros.  */
5075
5076       /* Get various versions of the type of the vector of indexes.  */
5077       tree index_vec_type = TREE_TYPE (induction_index);
5078       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5079       tree index_scalar_type = TREE_TYPE (index_vec_type);
5080       tree index_vec_cmp_type = truth_type_for (index_vec_type);
5081
5082       /* Get an unsigned integer version of the type of the data vector.  */
5083       int scalar_precision
5084         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5085       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5086       tree vectype_unsigned = build_vector_type
5087         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5088
5089       /* First we need to create a vector (ZERO_VEC) of zeros and another
5090          vector (MAX_INDEX_VEC) filled with the last matching index, which we
5091          can create using a MAX reduction and then expanding.
5092          In the case where the loop never made any matches, the max index will
5093          be zero.  */
5094
5095       /* Vector of {0, 0, 0,...}.  */
5096       tree zero_vec = build_zero_cst (vectype);
5097
5098       gimple_seq stmts = NULL;
5099       new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5100       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5101
5102       /* Find maximum value from the vector of found indexes.  */
5103       tree max_index = make_ssa_name (index_scalar_type);
5104       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5105                                                           1, induction_index);
5106       gimple_call_set_lhs (max_index_stmt, max_index);
5107       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5108
5109       /* Vector of {max_index, max_index, max_index,...}.  */
5110       tree max_index_vec = make_ssa_name (index_vec_type);
5111       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5112                                                       max_index);
5113       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5114                                                         max_index_vec_rhs);
5115       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5116
5117       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5118          with the vector (INDUCTION_INDEX) of found indexes, choosing values
5119          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5120          otherwise.  Only one value should match, resulting in a vector
5121          (VEC_COND) with one data value and the rest zeros.
5122          In the case where the loop never made any matches, every index will
5123          match, resulting in a vector with all data values (which will all be
5124          the default value).  */
5125
5126       /* Compare the max index vector to the vector of found indexes to find
5127          the position of the max value.  */
5128       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5129       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5130                                                       induction_index,
5131                                                       max_index_vec);
5132       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5133
5134       /* Use the compare to choose either values from the data vector or
5135          zero.  */
5136       tree vec_cond = make_ssa_name (vectype);
5137       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5138                                                    vec_compare, new_phi_result,
5139                                                    zero_vec);
5140       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5141
5142       /* Finally we need to extract the data value from the vector (VEC_COND)
5143          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5144          reduction, but because this doesn't exist, we can use a MAX reduction
5145          instead.  The data value might be signed or a float so we need to cast
5146          it first.
5147          In the case where the loop never made any matches, the data values are
5148          all identical, and so will reduce down correctly.  */
5149
5150       /* Make the matched data values unsigned.  */
5151       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5152       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5153                                        vec_cond);
5154       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5155                                                         VIEW_CONVERT_EXPR,
5156                                                         vec_cond_cast_rhs);
5157       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5158
5159       /* Reduce down to a scalar value.  */
5160       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5161       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5162                                                            1, vec_cond_cast);
5163       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5164       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5165
5166       /* Convert the reduced value back to the result type and set as the
5167          result.  */
5168       stmts = NULL;
5169       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5170                                data_reduc);
5171       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5172       scalar_results.safe_push (new_temp);
5173     }
5174   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5175            && reduc_fn == IFN_LAST)
5176     {
5177       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5178          idx = 0;
5179          idx_val = induction_index[0];
5180          val = data_reduc[0];
5181          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5182            if (induction_index[i] > idx_val)
5183              val = data_reduc[i], idx_val = induction_index[i];
5184          return val;  */
5185
5186       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5187       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5188       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5189       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5190       /* Enforced by vectorizable_reduction, which ensures we have target
5191          support before allowing a conditional reduction on variable-length
5192          vectors.  */
5193       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5194       tree idx_val = NULL_TREE, val = NULL_TREE;
5195       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5196         {
5197           tree old_idx_val = idx_val;
5198           tree old_val = val;
5199           idx_val = make_ssa_name (idx_eltype);
5200           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5201                                              build3 (BIT_FIELD_REF, idx_eltype,
5202                                                      induction_index,
5203                                                      bitsize_int (el_size),
5204                                                      bitsize_int (off)));
5205           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5206           val = make_ssa_name (data_eltype);
5207           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5208                                              build3 (BIT_FIELD_REF,
5209                                                      data_eltype,
5210                                                      new_phi_result,
5211                                                      bitsize_int (el_size),
5212                                                      bitsize_int (off)));
5213           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5214           if (off != 0)
5215             {
5216               tree new_idx_val = idx_val;
5217               if (off != v_size - el_size)
5218                 {
5219                   new_idx_val = make_ssa_name (idx_eltype);
5220                   epilog_stmt = gimple_build_assign (new_idx_val,
5221                                                      MAX_EXPR, idx_val,
5222                                                      old_idx_val);
5223                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5224                 }
5225               tree new_val = make_ssa_name (data_eltype);
5226               epilog_stmt = gimple_build_assign (new_val,
5227                                                  COND_EXPR,
5228                                                  build2 (GT_EXPR,
5229                                                          boolean_type_node,
5230                                                          idx_val,
5231                                                          old_idx_val),
5232                                                  val, old_val);
5233               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5234               idx_val = new_idx_val;
5235               val = new_val;
5236             }
5237         }
5238       /* Convert the reduced value back to the result type and set as the
5239          result.  */
5240       gimple_seq stmts = NULL;
5241       val = gimple_convert (&stmts, scalar_type, val);
5242       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5243       scalar_results.safe_push (val);
5244     }
5245
5246   /* 2.3 Create the reduction code, using one of the three schemes described
5247          above. In SLP we simply need to extract all the elements from the
5248          vector (without reducing them), so we use scalar shifts.  */
5249   else if (reduc_fn != IFN_LAST && !slp_reduc)
5250     {
5251       tree tmp;
5252       tree vec_elem_type;
5253
5254       /* Case 1:  Create:
5255          v_out2 = reduc_expr <v_out1>  */
5256
5257       if (dump_enabled_p ())
5258         dump_printf_loc (MSG_NOTE, vect_location,
5259                          "Reduce using direct vector reduction.\n");
5260
5261       gimple_seq stmts = NULL;
5262       new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5263       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5264       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5265                                vec_elem_type, new_phi_result);
5266       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5267       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5268
5269       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5270           && induc_val)
5271         {
5272           /* Earlier we set the initial value to be a vector if induc_val
5273              values.  Check the result and if it is induc_val then replace
5274              with the original initial value, unless induc_val is
5275              the same as initial_def already.  */
5276           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5277                                   induc_val);
5278
5279           tmp = make_ssa_name (new_scalar_dest);
5280           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5281                                              initial_def, new_temp);
5282           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5283           new_temp = tmp;
5284         }
5285
5286       scalar_results.safe_push (new_temp);
5287     }
5288   else if (direct_slp_reduc)
5289     {
5290       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5291          with the elements for other SLP statements replaced with the
5292          neutral value.  We can then do a normal reduction on each vector.  */
5293
5294       /* Enforced by vectorizable_reduction.  */
5295       gcc_assert (new_phis.length () == 1);
5296       gcc_assert (pow2p_hwi (group_size));
5297
5298       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5299       vec<stmt_vec_info> orig_phis
5300         = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5301       gimple_seq seq = NULL;
5302
5303       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5304          and the same element size as VECTYPE.  */
5305       tree index = build_index_vector (vectype, 0, 1);
5306       tree index_type = TREE_TYPE (index);
5307       tree index_elt_type = TREE_TYPE (index_type);
5308       tree mask_type = truth_type_for (index_type);
5309
5310       /* Create a vector that, for each element, identifies which of
5311          the REDUC_GROUP_SIZE results should use it.  */
5312       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5313       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5314                             build_vector_from_val (index_type, index_mask));
5315
5316       /* Get a neutral vector value.  This is simply a splat of the neutral
5317          scalar value if we have one, otherwise the initial scalar value
5318          is itself a neutral value.  */
5319       tree vector_identity = NULL_TREE;
5320       tree neutral_op = NULL_TREE;
5321       if (slp_node)
5322         {
5323           stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5324           neutral_op
5325             = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5326                                             vectype, code, first != NULL);
5327         }
5328       if (neutral_op)
5329         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5330                                                         neutral_op);
5331       for (unsigned int i = 0; i < group_size; ++i)
5332         {
5333           /* If there's no univeral neutral value, we can use the
5334              initial scalar value from the original PHI.  This is used
5335              for MIN and MAX reduction, for example.  */
5336           if (!neutral_op)
5337             {
5338               tree scalar_value
5339                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5340                                          loop_preheader_edge (loop));
5341               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5342                                              scalar_value);
5343               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5344                                                               scalar_value);
5345             }
5346
5347           /* Calculate the equivalent of:
5348
5349              sel[j] = (index[j] == i);
5350
5351              which selects the elements of NEW_PHI_RESULT that should
5352              be included in the result.  */
5353           tree compare_val = build_int_cst (index_elt_type, i);
5354           compare_val = build_vector_from_val (index_type, compare_val);
5355           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5356                                    index, compare_val);
5357
5358           /* Calculate the equivalent of:
5359
5360              vec = seq ? new_phi_result : vector_identity;
5361
5362              VEC is now suitable for a full vector reduction.  */
5363           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5364                                    sel, new_phi_result, vector_identity);
5365
5366           /* Do the reduction and convert it to the appropriate type.  */
5367           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5368                                       TREE_TYPE (vectype), vec);
5369           scalar = gimple_convert (&seq, scalar_type, scalar);
5370           scalar_results.safe_push (scalar);
5371         }
5372       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5373     }
5374   else
5375     {
5376       bool reduce_with_shift;
5377       tree vec_temp;
5378
5379       gcc_assert (slp_reduc || new_phis.length () == 1);
5380
5381       /* See if the target wants to do the final (shift) reduction
5382          in a vector mode of smaller size and first reduce upper/lower
5383          halves against each other.  */
5384       enum machine_mode mode1 = mode;
5385       tree stype = TREE_TYPE (vectype);
5386       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5387       unsigned nunits1 = nunits;
5388       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5389           && new_phis.length () == 1)
5390         {
5391           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5392           /* For SLP reductions we have to make sure lanes match up, but
5393              since we're doing individual element final reduction reducing
5394              vector width here is even more important.
5395              ???  We can also separate lanes with permutes, for the common
5396              case of power-of-two group-size odd/even extracts would work.  */
5397           if (slp_reduc && nunits != nunits1)
5398             {
5399               nunits1 = least_common_multiple (nunits1, group_size);
5400               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5401             }
5402         }
5403       if (!slp_reduc
5404           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5405         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5406
5407       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5408                                                            stype, nunits1);
5409       reduce_with_shift = have_whole_vector_shift (mode1);
5410       if (!VECTOR_MODE_P (mode1))
5411         reduce_with_shift = false;
5412       else
5413         {
5414           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5415           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5416             reduce_with_shift = false;
5417         }
5418
5419       /* First reduce the vector to the desired vector size we should
5420          do shift reduction on by combining upper and lower halves.  */
5421       new_temp = new_phi_result;
5422       while (nunits > nunits1)
5423         {
5424           nunits /= 2;
5425           vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5426                                                           stype, nunits);
5427           unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5428
5429           /* The target has to make sure we support lowpart/highpart
5430              extraction, either via direct vector extract or through
5431              an integer mode punning.  */
5432           tree dst1, dst2;
5433           if (convert_optab_handler (vec_extract_optab,
5434                                      TYPE_MODE (TREE_TYPE (new_temp)),
5435                                      TYPE_MODE (vectype1))
5436               != CODE_FOR_nothing)
5437             {
5438               /* Extract sub-vectors directly once vec_extract becomes
5439                  a conversion optab.  */
5440               dst1 = make_ssa_name (vectype1);
5441               epilog_stmt
5442                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5443                                          build3 (BIT_FIELD_REF, vectype1,
5444                                                  new_temp, TYPE_SIZE (vectype1),
5445                                                  bitsize_int (0)));
5446               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5447               dst2 =  make_ssa_name (vectype1);
5448               epilog_stmt
5449                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5450                                          build3 (BIT_FIELD_REF, vectype1,
5451                                                  new_temp, TYPE_SIZE (vectype1),
5452                                                  bitsize_int (bitsize)));
5453               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5454             }
5455           else
5456             {
5457               /* Extract via punning to appropriately sized integer mode
5458                  vector.  */
5459               tree eltype = build_nonstandard_integer_type (bitsize, 1);
5460               tree etype = build_vector_type (eltype, 2);
5461               gcc_assert (convert_optab_handler (vec_extract_optab,
5462                                                  TYPE_MODE (etype),
5463                                                  TYPE_MODE (eltype))
5464                           != CODE_FOR_nothing);
5465               tree tem = make_ssa_name (etype);
5466               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5467                                                  build1 (VIEW_CONVERT_EXPR,
5468                                                          etype, new_temp));
5469               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5470               new_temp = tem;
5471               tem = make_ssa_name (eltype);
5472               epilog_stmt
5473                   = gimple_build_assign (tem, BIT_FIELD_REF,
5474                                          build3 (BIT_FIELD_REF, eltype,
5475                                                  new_temp, TYPE_SIZE (eltype),
5476                                                  bitsize_int (0)));
5477               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5478               dst1 = make_ssa_name (vectype1);
5479               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5480                                                  build1 (VIEW_CONVERT_EXPR,
5481                                                          vectype1, tem));
5482               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5483               tem = make_ssa_name (eltype);
5484               epilog_stmt
5485                   = gimple_build_assign (tem, BIT_FIELD_REF,
5486                                          build3 (BIT_FIELD_REF, eltype,
5487                                                  new_temp, TYPE_SIZE (eltype),
5488                                                  bitsize_int (bitsize)));
5489               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5490               dst2 =  make_ssa_name (vectype1);
5491               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5492                                                  build1 (VIEW_CONVERT_EXPR,
5493                                                          vectype1, tem));
5494               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5495             }
5496
5497           new_temp = make_ssa_name (vectype1);
5498           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5499           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5500           new_phis[0] = epilog_stmt;
5501         }
5502
5503       if (reduce_with_shift && !slp_reduc)
5504         {
5505           int element_bitsize = tree_to_uhwi (bitsize);
5506           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5507              for variable-length vectors and also requires direct target support
5508              for loop reductions.  */
5509           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5510           int nelements = vec_size_in_bits / element_bitsize;
5511           vec_perm_builder sel;
5512           vec_perm_indices indices;
5513
5514           int elt_offset;
5515
5516           tree zero_vec = build_zero_cst (vectype1);
5517           /* Case 2: Create:
5518              for (offset = nelements/2; offset >= 1; offset/=2)
5519                 {
5520                   Create:  va' = vec_shift <va, offset>
5521                   Create:  va = vop <va, va'>
5522                 }  */
5523
5524           tree rhs;
5525
5526           if (dump_enabled_p ())
5527             dump_printf_loc (MSG_NOTE, vect_location,
5528                              "Reduce using vector shifts\n");
5529
5530           gimple_seq stmts = NULL;
5531           new_temp = gimple_convert (&stmts, vectype1, new_temp);
5532           for (elt_offset = nelements / 2;
5533                elt_offset >= 1;
5534                elt_offset /= 2)
5535             {
5536               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5537               indices.new_vector (sel, 2, nelements);
5538               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5539               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5540                                        new_temp, zero_vec, mask);
5541               new_temp = gimple_build (&stmts, code,
5542                                        vectype1, new_name, new_temp);
5543             }
5544           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5545
5546           /* 2.4  Extract the final scalar result.  Create:
5547              s_out3 = extract_field <v_out2, bitpos>  */
5548
5549           if (dump_enabled_p ())
5550             dump_printf_loc (MSG_NOTE, vect_location,
5551                              "extract scalar result\n");
5552
5553           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5554                         bitsize, bitsize_zero_node);
5555           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5556           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5557           gimple_assign_set_lhs (epilog_stmt, new_temp);
5558           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5559           scalar_results.safe_push (new_temp);
5560         }
5561       else
5562         {
5563           /* Case 3: Create:
5564              s = extract_field <v_out2, 0>
5565              for (offset = element_size;
5566                   offset < vector_size;
5567                   offset += element_size;)
5568                {
5569                  Create:  s' = extract_field <v_out2, offset>
5570                  Create:  s = op <s, s'>  // For non SLP cases
5571                }  */
5572
5573           if (dump_enabled_p ())
5574             dump_printf_loc (MSG_NOTE, vect_location,
5575                              "Reduce using scalar code.\n");
5576
5577           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5578           int element_bitsize = tree_to_uhwi (bitsize);
5579           tree compute_type = TREE_TYPE (vectype);
5580           gimple_seq stmts = NULL;
5581           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5582             {
5583               int bit_offset;
5584               if (gimple_code (new_phi) == GIMPLE_PHI)
5585                 vec_temp = PHI_RESULT (new_phi);
5586               else
5587                 vec_temp = gimple_assign_lhs (new_phi);
5588               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5589                                        vec_temp, bitsize, bitsize_zero_node);
5590
5591               /* In SLP we don't need to apply reduction operation, so we just
5592                  collect s' values in SCALAR_RESULTS.  */
5593               if (slp_reduc)
5594                 scalar_results.safe_push (new_temp);
5595
5596               for (bit_offset = element_bitsize;
5597                    bit_offset < vec_size_in_bits;
5598                    bit_offset += element_bitsize)
5599                 {
5600                   tree bitpos = bitsize_int (bit_offset);
5601                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
5602                                            compute_type, vec_temp,
5603                                            bitsize, bitpos);
5604                   if (slp_reduc)
5605                     {
5606                       /* In SLP we don't need to apply reduction operation, so
5607                          we just collect s' values in SCALAR_RESULTS.  */
5608                       new_temp = new_name;
5609                       scalar_results.safe_push (new_name);
5610                     }
5611                   else
5612                     new_temp = gimple_build (&stmts, code, compute_type,
5613                                              new_name, new_temp);
5614                 }
5615             }
5616
5617           /* The only case where we need to reduce scalar results in SLP, is
5618              unrolling.  If the size of SCALAR_RESULTS is greater than
5619              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5620              REDUC_GROUP_SIZE.  */
5621           if (slp_reduc)
5622             {
5623               tree res, first_res, new_res;
5624
5625               /* Reduce multiple scalar results in case of SLP unrolling.  */
5626               for (j = group_size; scalar_results.iterate (j, &res);
5627                    j++)
5628                 {
5629                   first_res = scalar_results[j % group_size];
5630                   new_res = gimple_build (&stmts, code, compute_type,
5631                                           first_res, res);
5632                   scalar_results[j % group_size] = new_res;
5633                 }
5634               for (k = 0; k < group_size; k++)
5635                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5636                                                     scalar_results[k]);
5637             }
5638           else
5639             {
5640               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5641               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5642               scalar_results.safe_push (new_temp);
5643             }
5644
5645           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5646         }
5647
5648       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5649           && induc_val)
5650         {
5651           /* Earlier we set the initial value to be a vector if induc_val
5652              values.  Check the result and if it is induc_val then replace
5653              with the original initial value, unless induc_val is
5654              the same as initial_def already.  */
5655           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5656                                   induc_val);
5657
5658           tree tmp = make_ssa_name (new_scalar_dest);
5659           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5660                                              initial_def, new_temp);
5661           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5662           scalar_results[0] = tmp;
5663         }
5664     }
5665
5666   /* 2.5 Adjust the final result by the initial value of the reduction
5667          variable. (When such adjustment is not needed, then
5668          'adjustment_def' is zero).  For example, if code is PLUS we create:
5669          new_temp = loop_exit_def + adjustment_def  */
5670
5671   if (adjustment_def)
5672     {
5673       gcc_assert (!slp_reduc);
5674       gimple_seq stmts = NULL;
5675       if (nested_in_vect_loop)
5676         {
5677           new_phi = new_phis[0];
5678           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5679           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5680           new_temp = gimple_build (&stmts, code, vectype,
5681                                    PHI_RESULT (new_phi), adjustment_def);
5682         }
5683       else
5684         {
5685           new_temp = scalar_results[0];
5686           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5687           adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5688           new_temp = gimple_build (&stmts, code, scalar_type,
5689                                    new_temp, adjustment_def);
5690         }
5691
5692       epilog_stmt = gimple_seq_last_stmt (stmts);
5693       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5694       if (nested_in_vect_loop)
5695         {
5696           if (!double_reduc)
5697             scalar_results.quick_push (new_temp);
5698           else
5699             scalar_results[0] = new_temp;
5700         }
5701       else
5702         scalar_results[0] = new_temp;
5703
5704       new_phis[0] = epilog_stmt;
5705     }
5706
5707   if (double_reduc)
5708     loop = loop->inner;
5709
5710   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5711           phis with new adjusted scalar results, i.e., replace use <s_out0>
5712           with use <s_out4>.
5713
5714      Transform:
5715         loop_exit:
5716           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5717           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5718           v_out2 = reduce <v_out1>
5719           s_out3 = extract_field <v_out2, 0>
5720           s_out4 = adjust_result <s_out3>
5721           use <s_out0>
5722           use <s_out0>
5723
5724      into:
5725
5726         loop_exit:
5727           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5728           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5729           v_out2 = reduce <v_out1>
5730           s_out3 = extract_field <v_out2, 0>
5731           s_out4 = adjust_result <s_out3>
5732           use <s_out4>
5733           use <s_out4> */
5734
5735
5736   /* In SLP reduction chain we reduce vector results into one vector if
5737      necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
5738      LHS of the last stmt in the reduction chain, since we are looking for
5739      the loop exit phi node.  */
5740   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5741     {
5742       stmt_vec_info dest_stmt_info
5743         = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5744       scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5745       group_size = 1;
5746     }
5747
5748   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5749      case that REDUC_GROUP_SIZE is greater than vectorization factor).
5750      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5751      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5752      correspond to the first vector stmt, etc.
5753      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5754   if (group_size > new_phis.length ())
5755     gcc_assert (!(group_size % new_phis.length ()));
5756
5757   for (k = 0; k < group_size; k++)
5758     {
5759       if (slp_reduc)
5760         {
5761           stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5762
5763           orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5764           /* SLP statements can't participate in patterns.  */
5765           gcc_assert (!orig_stmt_info);
5766           scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5767         }
5768
5769       if (nested_in_vect_loop)
5770         {
5771           if (double_reduc)
5772             loop = outer_loop;
5773           else
5774             gcc_unreachable ();
5775         }
5776
5777       phis.create (3);
5778       /* Find the loop-closed-use at the loop exit of the original scalar
5779          result.  (The reduction result is expected to have two immediate uses,
5780          one at the latch block, and one at the loop exit).  For double
5781          reductions we are looking for exit phis of the outer loop.  */
5782       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5783         {
5784           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5785             {
5786               if (!is_gimple_debug (USE_STMT (use_p)))
5787                 phis.safe_push (USE_STMT (use_p));
5788             }
5789           else
5790             {
5791               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5792                 {
5793                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5794
5795                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5796                     {
5797                       if (!flow_bb_inside_loop_p (loop,
5798                                              gimple_bb (USE_STMT (phi_use_p)))
5799                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5800                         phis.safe_push (USE_STMT (phi_use_p));
5801                     }
5802                 }
5803             }
5804         }
5805
5806       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5807         {
5808           /* Replace the uses:  */
5809           orig_name = PHI_RESULT (exit_phi);
5810           scalar_result = scalar_results[k];
5811           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5812             {
5813               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5814                 SET_USE (use_p, scalar_result);
5815               update_stmt (use_stmt);
5816             }
5817         }
5818
5819       phis.release ();
5820     }
5821 }
5822
5823 /* Return a vector of type VECTYPE that is equal to the vector select
5824    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5825    before GSI.  */
5826
5827 static tree
5828 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5829                      tree vec, tree identity)
5830 {
5831   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5832   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5833                                           mask, vec, identity);
5834   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5835   return cond;
5836 }
5837
5838 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5839    order, starting with LHS.  Insert the extraction statements before GSI and
5840    associate the new scalar SSA names with variable SCALAR_DEST.
5841    Return the SSA name for the result.  */
5842
5843 static tree
5844 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5845                        tree_code code, tree lhs, tree vector_rhs)
5846 {
5847   tree vectype = TREE_TYPE (vector_rhs);
5848   tree scalar_type = TREE_TYPE (vectype);
5849   tree bitsize = TYPE_SIZE (scalar_type);
5850   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5851   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5852
5853   for (unsigned HOST_WIDE_INT bit_offset = 0;
5854        bit_offset < vec_size_in_bits;
5855        bit_offset += element_bitsize)
5856     {
5857       tree bitpos = bitsize_int (bit_offset);
5858       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5859                          bitsize, bitpos);
5860
5861       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5862       rhs = make_ssa_name (scalar_dest, stmt);
5863       gimple_assign_set_lhs (stmt, rhs);
5864       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5865
5866       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5867       tree new_name = make_ssa_name (scalar_dest, stmt);
5868       gimple_assign_set_lhs (stmt, new_name);
5869       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5870       lhs = new_name;
5871     }
5872   return lhs;
5873 }
5874
5875 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
5876    type of the vector input.  */
5877
5878 static internal_fn
5879 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5880 {
5881   internal_fn mask_reduc_fn;
5882
5883   switch (reduc_fn)
5884     {
5885     case IFN_FOLD_LEFT_PLUS:
5886       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5887       break;
5888
5889     default:
5890       return IFN_LAST;
5891     }
5892
5893   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5894                                       OPTIMIZE_FOR_SPEED))
5895     return mask_reduc_fn;
5896   return IFN_LAST;
5897 }
5898
5899 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
5900    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5901    statement.  CODE is the operation performed by STMT_INFO and OPS are
5902    its scalar operands.  REDUC_INDEX is the index of the operand in
5903    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5904    implements in-order reduction, or IFN_LAST if we should open-code it.
5905    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5906    that should be used to control the operation in a fully-masked loop.  */
5907
5908 static bool
5909 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5910                                stmt_vec_info stmt_info,
5911                                gimple_stmt_iterator *gsi,
5912                                gimple **vec_stmt, slp_tree slp_node,
5913                                gimple *reduc_def_stmt,
5914                                tree_code code, internal_fn reduc_fn,
5915                                tree ops[3], tree vectype_in,
5916                                int reduc_index, vec_loop_masks *masks)
5917 {
5918   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5919   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5920   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5921
5922   int ncopies;
5923   if (slp_node)
5924     ncopies = 1;
5925   else
5926     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5927
5928   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5929   gcc_assert (ncopies == 1);
5930   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5931
5932   if (slp_node)
5933     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5934                           TYPE_VECTOR_SUBPARTS (vectype_in)));
5935
5936   tree op0 = ops[1 - reduc_index];
5937
5938   int group_size = 1;
5939   stmt_vec_info scalar_dest_def_info;
5940   auto_vec<tree> vec_oprnds0;
5941   if (slp_node)
5942     {
5943       auto_vec<vec<tree> > vec_defs (2);
5944       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
5945       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5946       vec_defs[0].release ();
5947       vec_defs[1].release ();
5948       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5949       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5950     }
5951   else
5952     {
5953       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
5954                                      op0, &vec_oprnds0);
5955       scalar_dest_def_info = stmt_info;
5956     }
5957
5958   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5959   tree scalar_type = TREE_TYPE (scalar_dest);
5960   tree reduc_var = gimple_phi_result (reduc_def_stmt);
5961
5962   int vec_num = vec_oprnds0.length ();
5963   gcc_assert (vec_num == 1 || slp_node);
5964   tree vec_elem_type = TREE_TYPE (vectype_out);
5965   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5966
5967   tree vector_identity = NULL_TREE;
5968   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5969     vector_identity = build_zero_cst (vectype_out);
5970
5971   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5972   int i;
5973   tree def0;
5974   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5975     {
5976       gimple *new_stmt;
5977       tree mask = NULL_TREE;
5978       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5979         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5980
5981       /* Handle MINUS by adding the negative.  */
5982       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5983         {
5984           tree negated = make_ssa_name (vectype_out);
5985           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5986           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5987           def0 = negated;
5988         }
5989
5990       if (mask && mask_reduc_fn == IFN_LAST)
5991         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5992                                     vector_identity);
5993
5994       /* On the first iteration the input is simply the scalar phi
5995          result, and for subsequent iterations it is the output of
5996          the preceding operation.  */
5997       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5998         {
5999           if (mask && mask_reduc_fn != IFN_LAST)
6000             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6001                                                    def0, mask);
6002           else
6003             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6004                                                    def0);
6005           /* For chained SLP reductions the output of the previous reduction
6006              operation serves as the input of the next. For the final statement
6007              the output cannot be a temporary - we reuse the original
6008              scalar destination of the last statement.  */
6009           if (i != vec_num - 1)
6010             {
6011               gimple_set_lhs (new_stmt, scalar_dest_var);
6012               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6013               gimple_set_lhs (new_stmt, reduc_var);
6014             }
6015         }
6016       else
6017         {
6018           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6019                                              reduc_var, def0);
6020           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6021           /* Remove the statement, so that we can use the same code paths
6022              as for statements that we've just created.  */
6023           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6024           gsi_remove (&tmp_gsi, true);
6025         }
6026
6027       if (i == vec_num - 1)
6028         {
6029           gimple_set_lhs (new_stmt, scalar_dest);
6030           vect_finish_replace_stmt (loop_vinfo,
6031                                     scalar_dest_def_info,
6032                                     new_stmt);
6033         }
6034       else
6035         vect_finish_stmt_generation (loop_vinfo,
6036                                      scalar_dest_def_info,
6037                                      new_stmt, gsi);
6038
6039       if (slp_node)
6040         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6041       else
6042         {
6043           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6044           *vec_stmt = new_stmt;
6045         }
6046     }
6047
6048   return true;
6049 }
6050
6051 /* Function is_nonwrapping_integer_induction.
6052
6053    Check if STMT_VINO (which is part of loop LOOP) both increments and
6054    does not cause overflow.  */
6055
6056 static bool
6057 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6058 {
6059   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6060   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6061   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6062   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6063   widest_int ni, max_loop_value, lhs_max;
6064   wi::overflow_type overflow = wi::OVF_NONE;
6065
6066   /* Make sure the loop is integer based.  */
6067   if (TREE_CODE (base) != INTEGER_CST
6068       || TREE_CODE (step) != INTEGER_CST)
6069     return false;
6070
6071   /* Check that the max size of the loop will not wrap.  */
6072
6073   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6074     return true;
6075
6076   if (! max_stmt_executions (loop, &ni))
6077     return false;
6078
6079   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6080                             &overflow);
6081   if (overflow)
6082     return false;
6083
6084   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6085                             TYPE_SIGN (lhs_type), &overflow);
6086   if (overflow)
6087     return false;
6088
6089   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6090           <= TYPE_PRECISION (lhs_type));
6091 }
6092
6093 /* Check if masking can be supported by inserting a conditional expression.
6094    CODE is the code for the operation.  COND_FN is the conditional internal
6095    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
6096 static bool
6097 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6098                          tree vectype_in)
6099 {
6100   if (cond_fn != IFN_LAST
6101       && direct_internal_fn_supported_p (cond_fn, vectype_in,
6102                                          OPTIMIZE_FOR_SPEED))
6103     return false;
6104
6105   switch (code)
6106     {
6107     case DOT_PROD_EXPR:
6108     case SAD_EXPR:
6109       return true;
6110
6111     default:
6112       return false;
6113     }
6114 }
6115
6116 /* Insert a conditional expression to enable masked vectorization.  CODE is the
6117    code for the operation.  VOP is the array of operands.  MASK is the loop
6118    mask.  GSI is a statement iterator used to place the new conditional
6119    expression.  */
6120 static void
6121 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6122                       gimple_stmt_iterator *gsi)
6123 {
6124   switch (code)
6125     {
6126     case DOT_PROD_EXPR:
6127       {
6128         tree vectype = TREE_TYPE (vop[1]);
6129         tree zero = build_zero_cst (vectype);
6130         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6131         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6132                                                mask, vop[1], zero);
6133         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6134         vop[1] = masked_op1;
6135         break;
6136       }
6137
6138     case SAD_EXPR:
6139       {
6140         tree vectype = TREE_TYPE (vop[1]);
6141         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6142         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6143                                                mask, vop[1], vop[0]);
6144         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6145         vop[1] = masked_op1;
6146         break;
6147       }
6148
6149     default:
6150       gcc_unreachable ();
6151     }
6152 }
6153
6154 /* Function vectorizable_reduction.
6155
6156    Check if STMT_INFO performs a reduction operation that can be vectorized.
6157    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6158    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6159    Return true if STMT_INFO is vectorizable in this way.
6160
6161    This function also handles reduction idioms (patterns) that have been
6162    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
6163    may be of this form:
6164      X = pattern_expr (arg0, arg1, ..., X)
6165    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6166    sequence that had been detected and replaced by the pattern-stmt
6167    (STMT_INFO).
6168
6169    This function also handles reduction of condition expressions, for example:
6170      for (int i = 0; i < N; i++)
6171        if (a[i] < value)
6172          last = a[i];
6173    This is handled by vectorising the loop and creating an additional vector
6174    containing the loop indexes for which "a[i] < value" was true.  In the
6175    function epilogue this is reduced to a single max value and then used to
6176    index into the vector of results.
6177
6178    In some cases of reduction patterns, the type of the reduction variable X is
6179    different than the type of the other arguments of STMT_INFO.
6180    In such cases, the vectype that is used when transforming STMT_INFO into
6181    a vector stmt is different than the vectype that is used to determine the
6182    vectorization factor, because it consists of a different number of elements
6183    than the actual number of elements that are being operated upon in parallel.
6184
6185    For example, consider an accumulation of shorts into an int accumulator.
6186    On some targets it's possible to vectorize this pattern operating on 8
6187    shorts at a time (hence, the vectype for purposes of determining the
6188    vectorization factor should be V8HI); on the other hand, the vectype that
6189    is used to create the vector form is actually V4SI (the type of the result).
6190
6191    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6192    indicates what is the actual level of parallelism (V8HI in the example), so
6193    that the right vectorization factor would be derived.  This vectype
6194    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6195    be used to create the vectorized stmt.  The right vectype for the vectorized
6196    stmt is obtained from the type of the result X:
6197       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6198
6199    This means that, contrary to "regular" reductions (or "regular" stmts in
6200    general), the following equation:
6201       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6202    does *NOT* necessarily hold for reduction patterns.  */
6203
6204 bool
6205 vectorizable_reduction (loop_vec_info loop_vinfo,
6206                         stmt_vec_info stmt_info, slp_tree slp_node,
6207                         slp_instance slp_node_instance,
6208                         stmt_vector_for_cost *cost_vec)
6209 {
6210   tree scalar_dest;
6211   tree vectype_in = NULL_TREE;
6212   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6213   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6214   stmt_vec_info cond_stmt_vinfo = NULL;
6215   tree scalar_type;
6216   int i;
6217   int ncopies;
6218   bool single_defuse_cycle = false;
6219   bool nested_cycle = false;
6220   bool double_reduc = false;
6221   int vec_num;
6222   tree tem;
6223   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6224   tree cond_reduc_val = NULL_TREE;
6225
6226   /* Make sure it was already recognized as a reduction computation.  */
6227   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6228       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6229       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6230     return false;
6231
6232   /* The stmt we store reduction analysis meta on.  */
6233   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6234   reduc_info->is_reduc_info = true;
6235
6236   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6237     {
6238       if (is_a <gphi *> (stmt_info->stmt))
6239         /* Analysis for double-reduction is done on the outer
6240            loop PHI, nested cycles have no further restrictions.  */
6241         STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6242       else
6243         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6244       return true;
6245     }
6246
6247   stmt_vec_info orig_stmt_of_analysis = stmt_info;
6248   stmt_vec_info phi_info = stmt_info;
6249   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6250       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6251     {
6252       if (!is_a <gphi *> (stmt_info->stmt))
6253         {
6254           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6255           return true;
6256         }
6257       if (slp_node)
6258         {
6259           slp_node_instance->reduc_phis = slp_node;
6260           /* ???  We're leaving slp_node to point to the PHIs, we only
6261              need it to get at the number of vector stmts which wasn't
6262              yet initialized for the instance root.  */
6263         }
6264       if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6265         stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6266       else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6267         {
6268           use_operand_p use_p;
6269           gimple *use_stmt;
6270           bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6271                                      &use_p, &use_stmt);
6272           gcc_assert (res);
6273           phi_info = loop_vinfo->lookup_stmt (use_stmt);
6274           stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6275         }
6276     }
6277
6278   /* PHIs should not participate in patterns.  */
6279   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6280   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6281
6282   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6283      and compute the reduction chain length.  */
6284   tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6285                                           loop_latch_edge (loop));
6286   unsigned reduc_chain_length = 0;
6287   bool only_slp_reduc_chain = true;
6288   stmt_info = NULL;
6289   while (reduc_def != PHI_RESULT (reduc_def_phi))
6290     {
6291       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6292       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6293       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6294         {
6295           if (dump_enabled_p ())
6296             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6297                              "reduction chain broken by patterns.\n");
6298           return false;
6299         }
6300       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6301         only_slp_reduc_chain = false;
6302       /* ???  For epilogue generation live members of the chain need
6303          to point back to the PHI via their original stmt for
6304          info_for_reduction to work.  */
6305       if (STMT_VINFO_LIVE_P (vdef))
6306         STMT_VINFO_REDUC_DEF (def) = phi_info;
6307       gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6308       if (!assign)
6309         {
6310           if (dump_enabled_p ())
6311             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6312                              "reduction chain includes calls.\n");
6313           return false;
6314         }
6315       if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6316         {
6317           if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6318                                       TREE_TYPE (gimple_assign_rhs1 (assign))))
6319             {
6320               if (dump_enabled_p ())
6321                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6322                                  "conversion in the reduction chain.\n");
6323               return false;
6324             }
6325         }
6326       else if (!stmt_info)
6327         /* First non-conversion stmt.  */
6328         stmt_info = vdef;
6329       reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6330       reduc_chain_length++;
6331     }
6332   /* PHIs should not participate in patterns.  */
6333   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6334
6335   if (nested_in_vect_loop_p (loop, stmt_info))
6336     {
6337       loop = loop->inner;
6338       nested_cycle = true;
6339     }
6340
6341   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6342      element.  */
6343   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6344     {
6345       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6346       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6347     }
6348   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6349     gcc_assert (slp_node
6350                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6351
6352   /* 1. Is vectorizable reduction?  */
6353   /* Not supportable if the reduction variable is used in the loop, unless
6354      it's a reduction chain.  */
6355   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6356       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6357     return false;
6358
6359   /* Reductions that are not used even in an enclosing outer-loop,
6360      are expected to be "live" (used out of the loop).  */
6361   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6362       && !STMT_VINFO_LIVE_P (stmt_info))
6363     return false;
6364
6365   /* 2. Has this been recognized as a reduction pattern?
6366
6367      Check if STMT represents a pattern that has been recognized
6368      in earlier analysis stages.  For stmts that represent a pattern,
6369      the STMT_VINFO_RELATED_STMT field records the last stmt in
6370      the original sequence that constitutes the pattern.  */
6371
6372   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6373   if (orig_stmt_info)
6374     {
6375       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6376       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6377     }
6378
6379   /* 3. Check the operands of the operation.  The first operands are defined
6380         inside the loop body. The last operand is the reduction variable,
6381         which is defined by the loop-header-phi.  */
6382
6383   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6384   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6385   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6386   enum tree_code code = gimple_assign_rhs_code (stmt);
6387   bool lane_reduc_code_p
6388     = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6389   int op_type = TREE_CODE_LENGTH (code);
6390
6391   scalar_dest = gimple_assign_lhs (stmt);
6392   scalar_type = TREE_TYPE (scalar_dest);
6393   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6394       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6395     return false;
6396
6397   /* Do not try to vectorize bit-precision reductions.  */
6398   if (!type_has_mode_precision_p (scalar_type))
6399     return false;
6400
6401   /* For lane-reducing ops we're reducing the number of reduction PHIs
6402      which means the only use of that may be in the lane-reducing operation.  */
6403   if (lane_reduc_code_p
6404       && reduc_chain_length != 1
6405       && !only_slp_reduc_chain)
6406     {
6407       if (dump_enabled_p ())
6408         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6409                          "lane-reducing reduction with extra stmts.\n");
6410       return false;
6411     }
6412
6413   /* All uses but the last are expected to be defined in the loop.
6414      The last use is the reduction variable.  In case of nested cycle this
6415      assumption is not true: we use reduc_index to record the index of the
6416      reduction variable.  */
6417   /* ???  To get at invariant/constant uses on the SLP node we have to
6418      get to it here, slp_node is still the reduction PHI.  */
6419   slp_tree slp_for_stmt_info = NULL;
6420   if (slp_node)
6421     {
6422       slp_for_stmt_info = slp_node_instance->root;
6423       /* And then there's reduction chain with a conversion ...  */
6424       if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
6425         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6426       gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
6427     }
6428   slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6429   /* We need to skip an extra operand for COND_EXPRs with embedded
6430      comparison.  */
6431   unsigned opno_adjust = 0;
6432   if (code == COND_EXPR
6433       && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6434     opno_adjust = 1;
6435   for (i = 0; i < op_type; i++)
6436     {
6437       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6438       if (i == 0 && code == COND_EXPR)
6439         continue;
6440
6441       stmt_vec_info def_stmt_info;
6442       enum vect_def_type dt;
6443       tree op;
6444       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6445                                i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6446                                &def_stmt_info))
6447         {
6448           if (dump_enabled_p ())
6449             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6450                              "use not simple.\n");
6451           return false;
6452         }
6453       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6454         continue;
6455
6456       /* There should be only one cycle def in the stmt, the one
6457          leading to reduc_def.  */
6458       if (VECTORIZABLE_CYCLE_DEF (dt))
6459         return false;
6460
6461       /* To properly compute ncopies we are interested in the widest
6462          non-reduction input type in case we're looking at a widening
6463          accumulation that we later handle in vect_transform_reduction.  */
6464       if (lane_reduc_code_p
6465           && tem
6466           && (!vectype_in
6467               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6468                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6469         vectype_in = tem;
6470
6471       if (code == COND_EXPR)
6472         {
6473           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
6474           if (dt == vect_constant_def)
6475             {
6476               cond_reduc_dt = dt;
6477               cond_reduc_val = op;
6478             }
6479           if (dt == vect_induction_def
6480               && def_stmt_info
6481               && is_nonwrapping_integer_induction (def_stmt_info, loop))
6482             {
6483               cond_reduc_dt = dt;
6484               cond_stmt_vinfo = def_stmt_info;
6485             }
6486         }
6487     }
6488   if (!vectype_in)
6489     vectype_in = STMT_VINFO_VECTYPE (phi_info);
6490   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6491
6492   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6493   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6494   /* If we have a condition reduction, see if we can simplify it further.  */
6495   if (v_reduc_type == COND_REDUCTION)
6496     {
6497       if (slp_node)
6498         return false;
6499
6500       /* When the condition uses the reduction value in the condition, fail.  */
6501       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6502         {
6503           if (dump_enabled_p ())
6504             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6505                              "condition depends on previous iteration\n");
6506           return false;
6507         }
6508
6509       if (reduc_chain_length == 1
6510           && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6511                                              vectype_in, OPTIMIZE_FOR_SPEED))
6512         {
6513           if (dump_enabled_p ())
6514             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6515                              "optimizing condition reduction with"
6516                              " FOLD_EXTRACT_LAST.\n");
6517           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6518         }
6519       else if (cond_reduc_dt == vect_induction_def)
6520         {
6521           tree base
6522             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6523           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6524
6525           gcc_assert (TREE_CODE (base) == INTEGER_CST
6526                       && TREE_CODE (step) == INTEGER_CST);
6527           cond_reduc_val = NULL_TREE;
6528           enum tree_code cond_reduc_op_code = ERROR_MARK;
6529           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6530           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6531             ;
6532           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6533              above base; punt if base is the minimum value of the type for
6534              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6535           else if (tree_int_cst_sgn (step) == -1)
6536             {
6537               cond_reduc_op_code = MIN_EXPR;
6538               if (tree_int_cst_sgn (base) == -1)
6539                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6540               else if (tree_int_cst_lt (base,
6541                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6542                 cond_reduc_val
6543                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6544             }
6545           else
6546             {
6547               cond_reduc_op_code = MAX_EXPR;
6548               if (tree_int_cst_sgn (base) == 1)
6549                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6550               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6551                                         base))
6552                 cond_reduc_val
6553                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6554             }
6555           if (cond_reduc_val)
6556             {
6557               if (dump_enabled_p ())
6558                 dump_printf_loc (MSG_NOTE, vect_location,
6559                                  "condition expression based on "
6560                                  "integer induction.\n");
6561               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6562               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6563                 = cond_reduc_val;
6564               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6565             }
6566         }
6567       else if (cond_reduc_dt == vect_constant_def)
6568         {
6569           enum vect_def_type cond_initial_dt;
6570           tree cond_initial_val
6571             = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6572
6573           gcc_assert (cond_reduc_val != NULL_TREE);
6574           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6575           if (cond_initial_dt == vect_constant_def
6576               && types_compatible_p (TREE_TYPE (cond_initial_val),
6577                                      TREE_TYPE (cond_reduc_val)))
6578             {
6579               tree e = fold_binary (LE_EXPR, boolean_type_node,
6580                                     cond_initial_val, cond_reduc_val);
6581               if (e && (integer_onep (e) || integer_zerop (e)))
6582                 {
6583                   if (dump_enabled_p ())
6584                     dump_printf_loc (MSG_NOTE, vect_location,
6585                                      "condition expression based on "
6586                                      "compile time constant.\n");
6587                   /* Record reduction code at analysis stage.  */
6588                   STMT_VINFO_REDUC_CODE (reduc_info)
6589                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6590                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6591                 }
6592             }
6593         }
6594     }
6595
6596   if (STMT_VINFO_LIVE_P (phi_info))
6597     return false;
6598
6599   if (slp_node)
6600     ncopies = 1;
6601   else
6602     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6603
6604   gcc_assert (ncopies >= 1);
6605
6606   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6607
6608   if (nested_cycle)
6609     {
6610       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6611                   == vect_double_reduction_def);
6612       double_reduc = true;
6613     }
6614
6615   /* 4.2. Check support for the epilog operation.
6616
6617           If STMT represents a reduction pattern, then the type of the
6618           reduction variable may be different than the type of the rest
6619           of the arguments.  For example, consider the case of accumulation
6620           of shorts into an int accumulator; The original code:
6621                         S1: int_a = (int) short_a;
6622           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6623
6624           was replaced with:
6625                         STMT: int_acc = widen_sum <short_a, int_acc>
6626
6627           This means that:
6628           1. The tree-code that is used to create the vector operation in the
6629              epilog code (that reduces the partial results) is not the
6630              tree-code of STMT, but is rather the tree-code of the original
6631              stmt from the pattern that STMT is replacing.  I.e, in the example
6632              above we want to use 'widen_sum' in the loop, but 'plus' in the
6633              epilog.
6634           2. The type (mode) we use to check available target support
6635              for the vector operation to be created in the *epilog*, is
6636              determined by the type of the reduction variable (in the example
6637              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6638              However the type (mode) we use to check available target support
6639              for the vector operation to be created *inside the loop*, is
6640              determined by the type of the other arguments to STMT (in the
6641              example we'd check this: optab_handler (widen_sum_optab,
6642              vect_short_mode)).
6643
6644           This is contrary to "regular" reductions, in which the types of all
6645           the arguments are the same as the type of the reduction variable.
6646           For "regular" reductions we can therefore use the same vector type
6647           (and also the same tree-code) when generating the epilog code and
6648           when generating the code inside the loop.  */
6649
6650   enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6651   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6652
6653   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6654   if (reduction_type == TREE_CODE_REDUCTION)
6655     {
6656       /* Check whether it's ok to change the order of the computation.
6657          Generally, when vectorizing a reduction we change the order of the
6658          computation.  This may change the behavior of the program in some
6659          cases, so we need to check that this is ok.  One exception is when
6660          vectorizing an outer-loop: the inner-loop is executed sequentially,
6661          and therefore vectorizing reductions in the inner-loop during
6662          outer-loop vectorization is safe.  */
6663       if (needs_fold_left_reduction_p (scalar_type, orig_code))
6664         {
6665           /* When vectorizing a reduction chain w/o SLP the reduction PHI
6666              is not directy used in stmt.  */
6667           if (!only_slp_reduc_chain
6668               && reduc_chain_length != 1)
6669             {
6670               if (dump_enabled_p ())
6671                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6672                                  "in-order reduction chain without SLP.\n");
6673               return false;
6674             }
6675           STMT_VINFO_REDUC_TYPE (reduc_info)
6676             = reduction_type = FOLD_LEFT_REDUCTION;
6677         }
6678       else if (!commutative_tree_code (orig_code)
6679                || !associative_tree_code (orig_code))
6680         {
6681           if (dump_enabled_p ())
6682             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6683                             "reduction: not commutative/associative");
6684           return false;
6685         }
6686     }
6687
6688   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6689       && ncopies > 1)
6690     {
6691       if (dump_enabled_p ())
6692         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6693                          "multiple types in double reduction or condition "
6694                          "reduction or fold-left reduction.\n");
6695       return false;
6696     }
6697
6698   internal_fn reduc_fn = IFN_LAST;
6699   if (reduction_type == TREE_CODE_REDUCTION
6700       || reduction_type == FOLD_LEFT_REDUCTION
6701       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6702       || reduction_type == CONST_COND_REDUCTION)
6703     {
6704       if (reduction_type == FOLD_LEFT_REDUCTION
6705           ? fold_left_reduction_fn (orig_code, &reduc_fn)
6706           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6707         {
6708           if (reduc_fn != IFN_LAST
6709               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6710                                                   OPTIMIZE_FOR_SPEED))
6711             {
6712               if (dump_enabled_p ())
6713                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6714                                  "reduc op not supported by target.\n");
6715
6716               reduc_fn = IFN_LAST;
6717             }
6718         }
6719       else
6720         {
6721           if (!nested_cycle || double_reduc)
6722             {
6723               if (dump_enabled_p ())
6724                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6725                                  "no reduc code for scalar code.\n");
6726
6727               return false;
6728             }
6729         }
6730     }
6731   else if (reduction_type == COND_REDUCTION)
6732     {
6733       int scalar_precision
6734         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6735       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6736       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6737                                                 nunits_out);
6738
6739       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6740                                           OPTIMIZE_FOR_SPEED))
6741         reduc_fn = IFN_REDUC_MAX;
6742     }
6743   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6744
6745   if (reduction_type != EXTRACT_LAST_REDUCTION
6746       && (!nested_cycle || double_reduc)
6747       && reduc_fn == IFN_LAST
6748       && !nunits_out.is_constant ())
6749     {
6750       if (dump_enabled_p ())
6751         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6752                          "missing target support for reduction on"
6753                          " variable-length vectors.\n");
6754       return false;
6755     }
6756
6757   /* For SLP reductions, see if there is a neutral value we can use.  */
6758   tree neutral_op = NULL_TREE;
6759   if (slp_node)
6760     neutral_op = neutral_op_for_slp_reduction
6761       (slp_node_instance->reduc_phis, vectype_out, orig_code,
6762        REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6763
6764   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6765     {
6766       /* We can't support in-order reductions of code such as this:
6767
6768            for (int i = 0; i < n1; ++i)
6769              for (int j = 0; j < n2; ++j)
6770                l += a[j];
6771
6772          since GCC effectively transforms the loop when vectorizing:
6773
6774            for (int i = 0; i < n1 / VF; ++i)
6775              for (int j = 0; j < n2; ++j)
6776                for (int k = 0; k < VF; ++k)
6777                  l += a[j];
6778
6779          which is a reassociation of the original operation.  */
6780       if (dump_enabled_p ())
6781         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6782                          "in-order double reduction not supported.\n");
6783
6784       return false;
6785     }
6786
6787   if (reduction_type == FOLD_LEFT_REDUCTION
6788       && slp_node
6789       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6790     {
6791       /* We cannot use in-order reductions in this case because there is
6792          an implicit reassociation of the operations involved.  */
6793       if (dump_enabled_p ())
6794         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6795                          "in-order unchained SLP reductions not supported.\n");
6796       return false;
6797     }
6798
6799   /* For double reductions, and for SLP reductions with a neutral value,
6800      we construct a variable-length initial vector by loading a vector
6801      full of the neutral value and then shift-and-inserting the start
6802      values into the low-numbered elements.  */
6803   if ((double_reduc || neutral_op)
6804       && !nunits_out.is_constant ()
6805       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6806                                           vectype_out, OPTIMIZE_FOR_SPEED))
6807     {
6808       if (dump_enabled_p ())
6809         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6810                          "reduction on variable-length vectors requires"
6811                          " target support for a vector-shift-and-insert"
6812                          " operation.\n");
6813       return false;
6814     }
6815
6816   /* Check extra constraints for variable-length unchained SLP reductions.  */
6817   if (STMT_SLP_TYPE (stmt_info)
6818       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6819       && !nunits_out.is_constant ())
6820     {
6821       /* We checked above that we could build the initial vector when
6822          there's a neutral element value.  Check here for the case in
6823          which each SLP statement has its own initial value and in which
6824          that value needs to be repeated for every instance of the
6825          statement within the initial vector.  */
6826       unsigned int group_size = SLP_TREE_LANES (slp_node);
6827       if (!neutral_op
6828           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6829                                               TREE_TYPE (vectype_out)))
6830         {
6831           if (dump_enabled_p ())
6832             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6833                              "unsupported form of SLP reduction for"
6834                              " variable-length vectors: cannot build"
6835                              " initial vector.\n");
6836           return false;
6837         }
6838       /* The epilogue code relies on the number of elements being a multiple
6839          of the group size.  The duplicate-and-interleave approach to setting
6840          up the initial vector does too.  */
6841       if (!multiple_p (nunits_out, group_size))
6842         {
6843           if (dump_enabled_p ())
6844             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6845                              "unsupported form of SLP reduction for"
6846                              " variable-length vectors: the vector size"
6847                              " is not a multiple of the number of results.\n");
6848           return false;
6849         }
6850     }
6851
6852   if (reduction_type == COND_REDUCTION)
6853     {
6854       widest_int ni;
6855
6856       if (! max_loop_iterations (loop, &ni))
6857         {
6858           if (dump_enabled_p ())
6859             dump_printf_loc (MSG_NOTE, vect_location,
6860                              "loop count not known, cannot create cond "
6861                              "reduction.\n");
6862           return false;
6863         }
6864       /* Convert backedges to iterations.  */
6865       ni += 1;
6866
6867       /* The additional index will be the same type as the condition.  Check
6868          that the loop can fit into this less one (because we'll use up the
6869          zero slot for when there are no matches).  */
6870       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6871       if (wi::geu_p (ni, wi::to_widest (max_index)))
6872         {
6873           if (dump_enabled_p ())
6874             dump_printf_loc (MSG_NOTE, vect_location,
6875                              "loop size is greater than data size.\n");
6876           return false;
6877         }
6878     }
6879
6880   /* In case the vectorization factor (VF) is bigger than the number
6881      of elements that we can fit in a vectype (nunits), we have to generate
6882      more than one vector stmt - i.e - we need to "unroll" the
6883      vector stmt by a factor VF/nunits.  For more details see documentation
6884      in vectorizable_operation.  */
6885
6886   /* If the reduction is used in an outer loop we need to generate
6887      VF intermediate results, like so (e.g. for ncopies=2):
6888         r0 = phi (init, r0)
6889         r1 = phi (init, r1)
6890         r0 = x0 + r0;
6891         r1 = x1 + r1;
6892     (i.e. we generate VF results in 2 registers).
6893     In this case we have a separate def-use cycle for each copy, and therefore
6894     for each copy we get the vector def for the reduction variable from the
6895     respective phi node created for this copy.
6896
6897     Otherwise (the reduction is unused in the loop nest), we can combine
6898     together intermediate results, like so (e.g. for ncopies=2):
6899         r = phi (init, r)
6900         r = x0 + r;
6901         r = x1 + r;
6902    (i.e. we generate VF/2 results in a single register).
6903    In this case for each copy we get the vector def for the reduction variable
6904    from the vectorized reduction operation generated in the previous iteration.
6905
6906    This only works when we see both the reduction PHI and its only consumer
6907    in vectorizable_reduction and there are no intermediate stmts
6908    participating.  */
6909   if (ncopies > 1
6910       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6911       && reduc_chain_length == 1)
6912     single_defuse_cycle = true;
6913
6914   if (single_defuse_cycle || lane_reduc_code_p)
6915     {
6916       gcc_assert (code != COND_EXPR);
6917
6918       /* 4. Supportable by target?  */
6919       bool ok = true;
6920
6921       /* 4.1. check support for the operation in the loop  */
6922       optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6923       if (!optab)
6924         {
6925           if (dump_enabled_p ())
6926             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6927                              "no optab.\n");
6928           ok = false;
6929         }
6930
6931       machine_mode vec_mode = TYPE_MODE (vectype_in);
6932       if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6933         {
6934           if (dump_enabled_p ())
6935             dump_printf (MSG_NOTE, "op not supported by target.\n");
6936           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6937               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6938             ok = false;
6939           else
6940             if (dump_enabled_p ())
6941               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6942         }
6943
6944       /* Worthwhile without SIMD support?  */
6945       if (ok
6946           && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6947           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6948         {
6949           if (dump_enabled_p ())
6950             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6951                              "not worthwhile without SIMD support.\n");
6952           ok = false;
6953         }
6954
6955       /* lane-reducing operations have to go through vect_transform_reduction.
6956          For the other cases try without the single cycle optimization.  */
6957       if (!ok)
6958         {
6959           if (lane_reduc_code_p)
6960             return false;
6961           else
6962             single_defuse_cycle = false;
6963         }
6964     }
6965   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6966
6967   /* If the reduction stmt is one of the patterns that have lane
6968      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6969   if ((ncopies > 1 && ! single_defuse_cycle)
6970       && lane_reduc_code_p)
6971     {
6972       if (dump_enabled_p ())
6973         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6974                          "multi def-use cycle not possible for lane-reducing "
6975                          "reduction operation\n");
6976       return false;
6977     }
6978
6979   if (slp_node
6980       && !(!single_defuse_cycle
6981            && code != DOT_PROD_EXPR
6982            && code != WIDEN_SUM_EXPR
6983            && code != SAD_EXPR
6984            && reduction_type != FOLD_LEFT_REDUCTION))
6985     for (i = 0; i < op_type; i++)
6986       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
6987         {
6988           if (dump_enabled_p ())
6989             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6990                              "incompatible vector types for invariants\n");
6991           return false;
6992         }
6993
6994   if (slp_node)
6995     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6996   else
6997     vec_num = 1;
6998
6999   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7000                              reduction_type, ncopies, cost_vec);
7001   if (dump_enabled_p ()
7002       && reduction_type == FOLD_LEFT_REDUCTION)
7003     dump_printf_loc (MSG_NOTE, vect_location,
7004                      "using an in-order (fold-left) reduction.\n");
7005   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7006   /* All but single defuse-cycle optimized, lane-reducing and fold-left
7007      reductions go through their own vectorizable_* routines.  */
7008   if (!single_defuse_cycle
7009       && code != DOT_PROD_EXPR
7010       && code != WIDEN_SUM_EXPR
7011       && code != SAD_EXPR
7012       && reduction_type != FOLD_LEFT_REDUCTION)
7013     {
7014       stmt_vec_info tem
7015         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7016       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7017         {
7018           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7019           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7020         }
7021       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7022       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7023     }
7024   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7025     {
7026       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7027       internal_fn cond_fn = get_conditional_internal_fn (code);
7028
7029       if (reduction_type != FOLD_LEFT_REDUCTION
7030           && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7031           && (cond_fn == IFN_LAST
7032               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7033                                                   OPTIMIZE_FOR_SPEED)))
7034         {
7035           if (dump_enabled_p ())
7036             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7037                              "can't operate on partial vectors because"
7038                              " no conditional operation is available.\n");
7039           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7040         }
7041       else if (reduction_type == FOLD_LEFT_REDUCTION
7042                && reduc_fn == IFN_LAST
7043                && !expand_vec_cond_expr_p (vectype_in,
7044                                            truth_type_for (vectype_in),
7045                                            SSA_NAME))
7046         {
7047           if (dump_enabled_p ())
7048             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7049                              "can't operate on partial vectors because"
7050                              " no conditional operation is available.\n");
7051           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7052         }
7053       else
7054         vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7055                                vectype_in, NULL);
7056     }
7057   return true;
7058 }
7059
7060 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7061    value.  */
7062
7063 bool
7064 vect_transform_reduction (loop_vec_info loop_vinfo,
7065                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7066                           gimple **vec_stmt, slp_tree slp_node)
7067 {
7068   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7069   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7070   int i;
7071   int ncopies;
7072   int vec_num;
7073
7074   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7075   gcc_assert (reduc_info->is_reduc_info);
7076
7077   if (nested_in_vect_loop_p (loop, stmt_info))
7078     {
7079       loop = loop->inner;
7080       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7081     }
7082
7083   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7084   enum tree_code code = gimple_assign_rhs_code (stmt);
7085   int op_type = TREE_CODE_LENGTH (code);
7086
7087   /* Flatten RHS.  */
7088   tree ops[3];
7089   switch (get_gimple_rhs_class (code))
7090     {
7091     case GIMPLE_TERNARY_RHS:
7092       ops[2] = gimple_assign_rhs3 (stmt);
7093       /* Fall thru.  */
7094     case GIMPLE_BINARY_RHS:
7095       ops[0] = gimple_assign_rhs1 (stmt);
7096       ops[1] = gimple_assign_rhs2 (stmt);
7097       break;
7098     default:
7099       gcc_unreachable ();
7100     }
7101
7102   /* All uses but the last are expected to be defined in the loop.
7103      The last use is the reduction variable.  In case of nested cycle this
7104      assumption is not true: we use reduc_index to record the index of the
7105      reduction variable.  */
7106   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7107   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7108   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7109   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7110
7111   if (slp_node)
7112     {
7113       ncopies = 1;
7114       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7115     }
7116   else
7117     {
7118       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7119       vec_num = 1;
7120     }
7121
7122   internal_fn cond_fn = get_conditional_internal_fn (code);
7123   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7124   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7125
7126   /* Transform.  */
7127   tree new_temp = NULL_TREE;
7128   auto_vec<tree> vec_oprnds0;
7129   auto_vec<tree> vec_oprnds1;
7130   auto_vec<tree> vec_oprnds2;
7131   tree def0;
7132
7133   if (dump_enabled_p ())
7134     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7135
7136   /* FORNOW: Multiple types are not supported for condition.  */
7137   if (code == COND_EXPR)
7138     gcc_assert (ncopies == 1);
7139
7140   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7141
7142   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7143   if (reduction_type == FOLD_LEFT_REDUCTION)
7144     {
7145       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7146       return vectorize_fold_left_reduction
7147           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7148            reduc_fn, ops, vectype_in, reduc_index, masks);
7149     }
7150
7151   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7152   gcc_assert (single_defuse_cycle
7153               || code == DOT_PROD_EXPR
7154               || code == WIDEN_SUM_EXPR
7155               || code == SAD_EXPR);
7156
7157   /* Create the destination vector  */
7158   tree scalar_dest = gimple_assign_lhs (stmt);
7159   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7160
7161   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7162                      single_defuse_cycle && reduc_index == 0
7163                      ? NULL_TREE : ops[0], &vec_oprnds0,
7164                      single_defuse_cycle && reduc_index == 1
7165                      ? NULL_TREE : ops[1], &vec_oprnds1,
7166                      op_type == ternary_op
7167                      && !(single_defuse_cycle && reduc_index == 2)
7168                      ? ops[2] : NULL_TREE, &vec_oprnds2);
7169   if (single_defuse_cycle)
7170     {
7171       gcc_assert (!slp_node);
7172       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7173                                      ops[reduc_index],
7174                                      reduc_index == 0 ? &vec_oprnds0
7175                                      : (reduc_index == 1 ? &vec_oprnds1
7176                                         : &vec_oprnds2));
7177     }
7178
7179   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7180     {
7181       gimple *new_stmt;
7182       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7183       if (masked_loop_p && !mask_by_cond_expr)
7184         {
7185           /* Make sure that the reduction accumulator is vop[0].  */
7186           if (reduc_index == 1)
7187             {
7188               gcc_assert (commutative_tree_code (code));
7189               std::swap (vop[0], vop[1]);
7190             }
7191           tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7192                                           vectype_in, i);
7193           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7194                                                     vop[0], vop[1], vop[0]);
7195           new_temp = make_ssa_name (vec_dest, call);
7196           gimple_call_set_lhs (call, new_temp);
7197           gimple_call_set_nothrow (call, true);
7198           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7199           new_stmt = call;
7200         }
7201       else
7202         {
7203           if (op_type == ternary_op)
7204             vop[2] = vec_oprnds2[i];
7205
7206           if (masked_loop_p && mask_by_cond_expr)
7207             {
7208               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7209                                               vectype_in, i);
7210               build_vect_cond_expr (code, vop, mask, gsi);
7211             }
7212
7213           new_stmt = gimple_build_assign (vec_dest, code,
7214                                           vop[0], vop[1], vop[2]);
7215           new_temp = make_ssa_name (vec_dest, new_stmt);
7216           gimple_assign_set_lhs (new_stmt, new_temp);
7217           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7218         }
7219
7220       if (slp_node)
7221         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7222       else if (single_defuse_cycle
7223                && i < ncopies - 1)
7224         {
7225           if (reduc_index == 0)
7226             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7227           else if (reduc_index == 1)
7228             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7229           else if (reduc_index == 2)
7230             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7231         }
7232       else
7233         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7234     }
7235
7236   if (!slp_node)
7237     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7238
7239   return true;
7240 }
7241
7242 /* Transform phase of a cycle PHI.  */
7243
7244 bool
7245 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7246                           stmt_vec_info stmt_info, gimple **vec_stmt,
7247                           slp_tree slp_node, slp_instance slp_node_instance)
7248 {
7249   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7250   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7251   int i;
7252   int ncopies;
7253   int j;
7254   bool nested_cycle = false;
7255   int vec_num;
7256
7257   if (nested_in_vect_loop_p (loop, stmt_info))
7258     {
7259       loop = loop->inner;
7260       nested_cycle = true;
7261     }
7262
7263   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7264   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7265   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7266   gcc_assert (reduc_info->is_reduc_info);
7267
7268   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7269       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7270     /* Leave the scalar phi in place.  */
7271     return true;
7272
7273   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7274   /* For a nested cycle we do not fill the above.  */
7275   if (!vectype_in)
7276     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7277   gcc_assert (vectype_in);
7278
7279   if (slp_node)
7280     {
7281       /* The size vect_schedule_slp_instance computes is off for us.  */
7282       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7283                                       * SLP_TREE_LANES (slp_node), vectype_in);
7284       ncopies = 1;
7285     }
7286   else
7287     {
7288       vec_num = 1;
7289       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7290     }
7291
7292   /* Check whether we should use a single PHI node and accumulate
7293      vectors to one before the backedge.  */
7294   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7295     ncopies = 1;
7296
7297   /* Create the destination vector  */
7298   gphi *phi = as_a <gphi *> (stmt_info->stmt);
7299   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7300                                                vectype_out);
7301
7302   /* Get the loop-entry arguments.  */
7303   tree vec_initial_def;
7304   auto_vec<tree> vec_initial_defs;
7305   if (slp_node)
7306     {
7307       vec_initial_defs.reserve (vec_num);
7308       gcc_assert (slp_node == slp_node_instance->reduc_phis);
7309       stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7310       tree neutral_op
7311         = neutral_op_for_slp_reduction (slp_node, vectype_out,
7312                                         STMT_VINFO_REDUC_CODE (reduc_info),
7313                                         first != NULL);
7314       get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7315                                       &vec_initial_defs, vec_num,
7316                                       first != NULL, neutral_op);
7317     }
7318   else
7319     {
7320       /* Get at the scalar def before the loop, that defines the initial
7321          value of the reduction variable.  */
7322       tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7323                                                 loop_preheader_edge (loop));
7324       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7325          and we can't use zero for induc_val, use initial_def.  Similarly
7326          for REDUC_MIN and initial_def larger than the base.  */
7327       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7328         {
7329           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7330           if (TREE_CODE (initial_def) == INTEGER_CST
7331               && !integer_zerop (induc_val)
7332               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7333                    && tree_int_cst_lt (initial_def, induc_val))
7334                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7335                       && tree_int_cst_lt (induc_val, initial_def))))
7336             {
7337               induc_val = initial_def;
7338               /* Communicate we used the initial_def to epilouge
7339                  generation.  */
7340               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7341             }
7342           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7343           vec_initial_defs.create (ncopies);
7344           for (i = 0; i < ncopies; ++i)
7345             vec_initial_defs.quick_push (vec_initial_def);
7346         }
7347       else if (nested_cycle)
7348         {
7349           /* Do not use an adjustment def as that case is not supported
7350              correctly if ncopies is not one.  */
7351           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7352                                          ncopies, initial_def,
7353                                          &vec_initial_defs);
7354         }
7355       else
7356         {
7357           tree adjustment_def = NULL_TREE;
7358           tree *adjustment_defp = &adjustment_def;
7359           enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7360           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7361             adjustment_defp = NULL;
7362           vec_initial_def
7363             = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7364                                              initial_def, adjustment_defp);
7365           STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7366           vec_initial_defs.create (ncopies);
7367           for (i = 0; i < ncopies; ++i)
7368             vec_initial_defs.quick_push (vec_initial_def);
7369         }
7370     }
7371
7372   /* Generate the reduction PHIs upfront.  */
7373   for (i = 0; i < vec_num; i++)
7374     {
7375       tree vec_init_def = vec_initial_defs[i];
7376       for (j = 0; j < ncopies; j++)
7377         {
7378           /* Create the reduction-phi that defines the reduction
7379              operand.  */
7380           gphi *new_phi = create_phi_node (vec_dest, loop->header);
7381
7382           /* Set the loop-entry arg of the reduction-phi.  */
7383           if (j != 0 && nested_cycle)
7384             vec_init_def = vec_initial_defs[j];
7385           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7386                        UNKNOWN_LOCATION);
7387
7388           /* The loop-latch arg is set in epilogue processing.  */
7389
7390           if (slp_node)
7391             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7392           else
7393             {
7394               if (j == 0)
7395                 *vec_stmt = new_phi;
7396               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7397             }
7398         }
7399     }
7400
7401   return true;
7402 }
7403
7404 /* Vectorizes LC PHIs.  */
7405
7406 bool
7407 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7408                      stmt_vec_info stmt_info, gimple **vec_stmt,
7409                      slp_tree slp_node)
7410 {
7411   if (!loop_vinfo
7412       || !is_a <gphi *> (stmt_info->stmt)
7413       || gimple_phi_num_args (stmt_info->stmt) != 1)
7414     return false;
7415
7416   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7417       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7418     return false;
7419
7420   if (!vec_stmt) /* transformation not required.  */
7421     {
7422       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7423       return true;
7424     }
7425
7426   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7427   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7428   basic_block bb = gimple_bb (stmt_info->stmt);
7429   edge e = single_pred_edge (bb);
7430   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7431   auto_vec<tree> vec_oprnds;
7432   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7433                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7434                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7435   for (unsigned i = 0; i < vec_oprnds.length (); i++)
7436     {
7437       /* Create the vectorized LC PHI node.  */
7438       gphi *new_phi = create_phi_node (vec_dest, bb);
7439       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7440       if (slp_node)
7441         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7442       else
7443         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7444     }
7445   if (!slp_node)
7446     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7447
7448   return true;
7449 }
7450
7451
7452 /* Function vect_min_worthwhile_factor.
7453
7454    For a loop where we could vectorize the operation indicated by CODE,
7455    return the minimum vectorization factor that makes it worthwhile
7456    to use generic vectors.  */
7457 static unsigned int
7458 vect_min_worthwhile_factor (enum tree_code code)
7459 {
7460   switch (code)
7461     {
7462     case PLUS_EXPR:
7463     case MINUS_EXPR:
7464     case NEGATE_EXPR:
7465       return 4;
7466
7467     case BIT_AND_EXPR:
7468     case BIT_IOR_EXPR:
7469     case BIT_XOR_EXPR:
7470     case BIT_NOT_EXPR:
7471       return 2;
7472
7473     default:
7474       return INT_MAX;
7475     }
7476 }
7477
7478 /* Return true if VINFO indicates we are doing loop vectorization and if
7479    it is worth decomposing CODE operations into scalar operations for
7480    that loop's vectorization factor.  */
7481
7482 bool
7483 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7484 {
7485   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7486   unsigned HOST_WIDE_INT value;
7487   return (loop_vinfo
7488           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7489           && value >= vect_min_worthwhile_factor (code));
7490 }
7491
7492 /* Function vectorizable_induction
7493
7494    Check if STMT_INFO performs an induction computation that can be vectorized.
7495    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7496    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7497    Return true if STMT_INFO is vectorizable in this way.  */
7498
7499 bool
7500 vectorizable_induction (loop_vec_info loop_vinfo,
7501                         stmt_vec_info stmt_info,
7502                         gimple **vec_stmt, slp_tree slp_node,
7503                         stmt_vector_for_cost *cost_vec)
7504 {
7505   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7506   unsigned ncopies;
7507   bool nested_in_vect_loop = false;
7508   class loop *iv_loop;
7509   tree vec_def;
7510   edge pe = loop_preheader_edge (loop);
7511   basic_block new_bb;
7512   tree new_vec, vec_init, vec_step, t;
7513   tree new_name;
7514   gimple *new_stmt;
7515   gphi *induction_phi;
7516   tree induc_def, vec_dest;
7517   tree init_expr, step_expr;
7518   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7519   unsigned i;
7520   tree expr;
7521   gimple_seq stmts;
7522   gimple_stmt_iterator si;
7523
7524   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7525   if (!phi)
7526     return false;
7527
7528   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7529     return false;
7530
7531   /* Make sure it was recognized as induction computation.  */
7532   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7533     return false;
7534
7535   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7536   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7537
7538   if (slp_node)
7539     ncopies = 1;
7540   else
7541     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7542   gcc_assert (ncopies >= 1);
7543
7544   /* FORNOW. These restrictions should be relaxed.  */
7545   if (nested_in_vect_loop_p (loop, stmt_info))
7546     {
7547       imm_use_iterator imm_iter;
7548       use_operand_p use_p;
7549       gimple *exit_phi;
7550       edge latch_e;
7551       tree loop_arg;
7552
7553       if (ncopies > 1)
7554         {
7555           if (dump_enabled_p ())
7556             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7557                              "multiple types in nested loop.\n");
7558           return false;
7559         }
7560
7561       /* FORNOW: outer loop induction with SLP not supported.  */
7562       if (STMT_SLP_TYPE (stmt_info))
7563         return false;
7564
7565       exit_phi = NULL;
7566       latch_e = loop_latch_edge (loop->inner);
7567       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7568       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7569         {
7570           gimple *use_stmt = USE_STMT (use_p);
7571           if (is_gimple_debug (use_stmt))
7572             continue;
7573
7574           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7575             {
7576               exit_phi = use_stmt;
7577               break;
7578             }
7579         }
7580       if (exit_phi)
7581         {
7582           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7583           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7584                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7585             {
7586               if (dump_enabled_p ())
7587                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7588                                  "inner-loop induction only used outside "
7589                                  "of the outer vectorized loop.\n");
7590               return false;
7591             }
7592         }
7593
7594       nested_in_vect_loop = true;
7595       iv_loop = loop->inner;
7596     }
7597   else
7598     iv_loop = loop;
7599   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7600
7601   if (slp_node && !nunits.is_constant ())
7602     {
7603       /* The current SLP code creates the initial value element-by-element.  */
7604       if (dump_enabled_p ())
7605         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7606                          "SLP induction not supported for variable-length"
7607                          " vectors.\n");
7608       return false;
7609     }
7610
7611   if (!vec_stmt) /* transformation not required.  */
7612     {
7613       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7614       DUMP_VECT_SCOPE ("vectorizable_induction");
7615       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7616       return true;
7617     }
7618
7619   /* Transform.  */
7620
7621   /* Compute a vector variable, initialized with the first VF values of
7622      the induction variable.  E.g., for an iv with IV_PHI='X' and
7623      evolution S, for a vector of 4 units, we want to compute:
7624      [X, X + S, X + 2*S, X + 3*S].  */
7625
7626   if (dump_enabled_p ())
7627     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7628
7629   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7630   gcc_assert (step_expr != NULL_TREE);
7631   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7632
7633   pe = loop_preheader_edge (iv_loop);
7634   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7635                                      loop_preheader_edge (iv_loop));
7636
7637   stmts = NULL;
7638   if (!nested_in_vect_loop)
7639     {
7640       /* Convert the initial value to the IV update type.  */
7641       tree new_type = TREE_TYPE (step_expr);
7642       init_expr = gimple_convert (&stmts, new_type, init_expr);
7643
7644       /* If we are using the loop mask to "peel" for alignment then we need
7645          to adjust the start value here.  */
7646       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7647       if (skip_niters != NULL_TREE)
7648         {
7649           if (FLOAT_TYPE_P (vectype))
7650             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7651                                         skip_niters);
7652           else
7653             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7654           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7655                                          skip_niters, step_expr);
7656           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7657                                     init_expr, skip_step);
7658         }
7659     }
7660
7661   if (stmts)
7662     {
7663       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7664       gcc_assert (!new_bb);
7665     }
7666
7667   /* Find the first insertion point in the BB.  */
7668   basic_block bb = gimple_bb (phi);
7669   si = gsi_after_labels (bb);
7670
7671   /* For SLP induction we have to generate several IVs as for example
7672      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7673      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7674      [VF*S, VF*S, VF*S, VF*S] for all.  */
7675   if (slp_node)
7676     {
7677       /* Enforced above.  */
7678       unsigned int const_nunits = nunits.to_constant ();
7679
7680       /* Generate [VF*S, VF*S, ... ].  */
7681       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7682         {
7683           expr = build_int_cst (integer_type_node, vf);
7684           expr = fold_convert (TREE_TYPE (step_expr), expr);
7685         }
7686       else
7687         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7688       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7689                               expr, step_expr);
7690       if (! CONSTANT_CLASS_P (new_name))
7691         new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7692                                      TREE_TYPE (step_expr), NULL);
7693       new_vec = build_vector_from_val (step_vectype, new_name);
7694       vec_step = vect_init_vector (loop_vinfo, stmt_info,
7695                                    new_vec, step_vectype, NULL);
7696
7697       /* Now generate the IVs.  */
7698       unsigned group_size = SLP_TREE_LANES (slp_node);
7699       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7700       unsigned elts = const_nunits * nvects;
7701       /* Compute the number of distinct IVs we need.  First reduce
7702          group_size if it is a multiple of const_nunits so we get
7703          one IV for a group_size of 4 but const_nunits 2.  */
7704       unsigned group_sizep = group_size;
7705       if (group_sizep % const_nunits == 0)
7706         group_sizep = group_sizep / const_nunits;
7707       unsigned nivs = least_common_multiple (group_sizep,
7708                                              const_nunits) / const_nunits;
7709       gcc_assert (elts % group_size == 0);
7710       tree elt = init_expr;
7711       unsigned ivn;
7712       for (ivn = 0; ivn < nivs; ++ivn)
7713         {
7714           tree_vector_builder elts (step_vectype, const_nunits, 1);
7715           stmts = NULL;
7716           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7717             {
7718               if (ivn*const_nunits + eltn >= group_size
7719                   && (ivn * const_nunits + eltn) % group_size == 0)
7720                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7721                                     elt, step_expr);
7722               elts.quick_push (elt);
7723             }
7724           vec_init = gimple_build_vector (&stmts, &elts);
7725           vec_init = gimple_convert (&stmts, vectype, vec_init);
7726           if (stmts)
7727             {
7728               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7729               gcc_assert (!new_bb);
7730             }
7731
7732           /* Create the induction-phi that defines the induction-operand.  */
7733           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7734           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7735           induc_def = PHI_RESULT (induction_phi);
7736
7737           /* Create the iv update inside the loop  */
7738           gimple_seq stmts = NULL;
7739           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7740           vec_def = gimple_build (&stmts,
7741                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
7742           vec_def = gimple_convert (&stmts, vectype, vec_def);
7743           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7744
7745           /* Set the arguments of the phi node:  */
7746           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7747           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7748                        UNKNOWN_LOCATION);
7749
7750           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7751         }
7752       /* Fill up to the number of vectors we need for the whole group.  */
7753       nivs = least_common_multiple (group_size,
7754                                     const_nunits) / const_nunits;
7755       for (; ivn < nivs; ++ivn)
7756         SLP_TREE_VEC_STMTS (slp_node)
7757           .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
7758
7759       /* Re-use IVs when we can.  */
7760       if (ivn < nvects)
7761         {
7762           unsigned vfp
7763             = least_common_multiple (group_size, const_nunits) / group_size;
7764           /* Generate [VF'*S, VF'*S, ... ].  */
7765           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7766             {
7767               expr = build_int_cst (integer_type_node, vfp);
7768               expr = fold_convert (TREE_TYPE (step_expr), expr);
7769             }
7770           else
7771             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7772           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7773                                   expr, step_expr);
7774           if (! CONSTANT_CLASS_P (new_name))
7775             new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7776                                          TREE_TYPE (step_expr), NULL);
7777           new_vec = build_vector_from_val (step_vectype, new_name);
7778           vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
7779                                        step_vectype, NULL);
7780           for (; ivn < nvects; ++ivn)
7781             {
7782               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7783               tree def;
7784               if (gimple_code (iv) == GIMPLE_PHI)
7785                 def = gimple_phi_result (iv);
7786               else
7787                 def = gimple_assign_lhs (iv);
7788               gimple_seq stmts = NULL;
7789               def = gimple_convert (&stmts, step_vectype, def);
7790               def = gimple_build (&stmts,
7791                                   PLUS_EXPR, step_vectype, def, vec_step);
7792               def = gimple_convert (&stmts, vectype, def);
7793               if (gimple_code (iv) == GIMPLE_PHI)
7794                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7795               else
7796                 {
7797                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7798                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7799                 }
7800               SLP_TREE_VEC_STMTS (slp_node)
7801                 .quick_push (SSA_NAME_DEF_STMT (def));
7802             }
7803         }
7804
7805       return true;
7806     }
7807
7808   /* Create the vector that holds the initial_value of the induction.  */
7809   if (nested_in_vect_loop)
7810     {
7811       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7812          been created during vectorization of previous stmts.  We obtain it
7813          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7814       auto_vec<tree> vec_inits;
7815       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7816                                      init_expr, &vec_inits);
7817       vec_init = vec_inits[0];
7818       /* If the initial value is not of proper type, convert it.  */
7819       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7820         {
7821           new_stmt
7822             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7823                                                           vect_simple_var,
7824                                                           "vec_iv_"),
7825                                    VIEW_CONVERT_EXPR,
7826                                    build1 (VIEW_CONVERT_EXPR, vectype,
7827                                            vec_init));
7828           vec_init = gimple_assign_lhs (new_stmt);
7829           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7830                                                  new_stmt);
7831           gcc_assert (!new_bb);
7832         }
7833     }
7834   else
7835     {
7836       /* iv_loop is the loop to be vectorized. Create:
7837          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7838       stmts = NULL;
7839       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7840
7841       unsigned HOST_WIDE_INT const_nunits;
7842       if (nunits.is_constant (&const_nunits))
7843         {
7844           tree_vector_builder elts (step_vectype, const_nunits, 1);
7845           elts.quick_push (new_name);
7846           for (i = 1; i < const_nunits; i++)
7847             {
7848               /* Create: new_name_i = new_name + step_expr  */
7849               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7850                                        new_name, step_expr);
7851               elts.quick_push (new_name);
7852             }
7853           /* Create a vector from [new_name_0, new_name_1, ...,
7854              new_name_nunits-1]  */
7855           vec_init = gimple_build_vector (&stmts, &elts);
7856         }
7857       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7858         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7859         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7860                                  new_name, step_expr);
7861       else
7862         {
7863           /* Build:
7864                 [base, base, base, ...]
7865                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7866           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7867           gcc_assert (flag_associative_math);
7868           tree index = build_index_vector (step_vectype, 0, 1);
7869           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7870                                                         new_name);
7871           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7872                                                         step_expr);
7873           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7874           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7875                                    vec_init, step_vec);
7876           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7877                                    vec_init, base_vec);
7878         }
7879       vec_init = gimple_convert (&stmts, vectype, vec_init);
7880
7881       if (stmts)
7882         {
7883           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7884           gcc_assert (!new_bb);
7885         }
7886     }
7887
7888
7889   /* Create the vector that holds the step of the induction.  */
7890   if (nested_in_vect_loop)
7891     /* iv_loop is nested in the loop to be vectorized. Generate:
7892        vec_step = [S, S, S, S]  */
7893     new_name = step_expr;
7894   else
7895     {
7896       /* iv_loop is the loop to be vectorized. Generate:
7897           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7898       gimple_seq seq = NULL;
7899       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7900         {
7901           expr = build_int_cst (integer_type_node, vf);
7902           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7903         }
7904       else
7905         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7906       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7907                                expr, step_expr);
7908       if (seq)
7909         {
7910           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7911           gcc_assert (!new_bb);
7912         }
7913     }
7914
7915   t = unshare_expr (new_name);
7916   gcc_assert (CONSTANT_CLASS_P (new_name)
7917               || TREE_CODE (new_name) == SSA_NAME);
7918   new_vec = build_vector_from_val (step_vectype, t);
7919   vec_step = vect_init_vector (loop_vinfo, stmt_info,
7920                                new_vec, step_vectype, NULL);
7921
7922
7923   /* Create the following def-use cycle:
7924      loop prolog:
7925          vec_init = ...
7926          vec_step = ...
7927      loop:
7928          vec_iv = PHI <vec_init, vec_loop>
7929          ...
7930          STMT
7931          ...
7932          vec_loop = vec_iv + vec_step;  */
7933
7934   /* Create the induction-phi that defines the induction-operand.  */
7935   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7936   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7937   induc_def = PHI_RESULT (induction_phi);
7938
7939   /* Create the iv update inside the loop  */
7940   stmts = NULL;
7941   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7942   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7943   vec_def = gimple_convert (&stmts, vectype, vec_def);
7944   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7945   new_stmt = SSA_NAME_DEF_STMT (vec_def);
7946
7947   /* Set the arguments of the phi node:  */
7948   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7949   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7950                UNKNOWN_LOCATION);
7951
7952   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
7953   *vec_stmt = induction_phi;
7954
7955   /* In case that vectorization factor (VF) is bigger than the number
7956      of elements that we can fit in a vectype (nunits), we have to generate
7957      more than one vector stmt - i.e - we need to "unroll" the
7958      vector stmt by a factor VF/nunits.  For more details see documentation
7959      in vectorizable_operation.  */
7960
7961   if (ncopies > 1)
7962     {
7963       gimple_seq seq = NULL;
7964       /* FORNOW. This restriction should be relaxed.  */
7965       gcc_assert (!nested_in_vect_loop);
7966
7967       /* Create the vector that holds the step of the induction.  */
7968       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7969         {
7970           expr = build_int_cst (integer_type_node, nunits);
7971           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7972         }
7973       else
7974         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7975       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7976                                expr, step_expr);
7977       if (seq)
7978         {
7979           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7980           gcc_assert (!new_bb);
7981         }
7982
7983       t = unshare_expr (new_name);
7984       gcc_assert (CONSTANT_CLASS_P (new_name)
7985                   || TREE_CODE (new_name) == SSA_NAME);
7986       new_vec = build_vector_from_val (step_vectype, t);
7987       vec_step = vect_init_vector (loop_vinfo, stmt_info,
7988                                    new_vec, step_vectype, NULL);
7989
7990       vec_def = induc_def;
7991       for (i = 1; i < ncopies; i++)
7992         {
7993           /* vec_i = vec_prev + vec_step  */
7994           gimple_seq stmts = NULL;
7995           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7996           vec_def = gimple_build (&stmts,
7997                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
7998           vec_def = gimple_convert (&stmts, vectype, vec_def);
7999
8000           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8001           new_stmt = SSA_NAME_DEF_STMT (vec_def);
8002           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8003         }
8004     }
8005
8006   if (dump_enabled_p ())
8007     dump_printf_loc (MSG_NOTE, vect_location,
8008                      "transform induction: created def-use cycle: %G%G",
8009                      induction_phi, SSA_NAME_DEF_STMT (vec_def));
8010
8011   return true;
8012 }
8013
8014 /* Function vectorizable_live_operation.
8015
8016    STMT_INFO computes a value that is used outside the loop.  Check if
8017    it can be supported.  */
8018
8019 bool
8020 vectorizable_live_operation (loop_vec_info loop_vinfo,
8021                              stmt_vec_info stmt_info,
8022                              gimple_stmt_iterator *gsi,
8023                              slp_tree slp_node, slp_instance slp_node_instance,
8024                              int slp_index, bool vec_stmt_p,
8025                              stmt_vector_for_cost *)
8026 {
8027   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8028   imm_use_iterator imm_iter;
8029   tree lhs, lhs_type, bitsize, vec_bitsize;
8030   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8031   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8032   int ncopies;
8033   gimple *use_stmt;
8034   auto_vec<tree> vec_oprnds;
8035   int vec_entry = 0;
8036   poly_uint64 vec_index = 0;
8037
8038   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8039
8040   /* If a stmt of a reduction is live, vectorize it via
8041      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
8042      validity so just trigger the transform here.  */
8043   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8044     {
8045       if (!vec_stmt_p)
8046         return true;
8047       if (slp_node)
8048         {
8049           /* For reduction chains the meta-info is attached to
8050              the group leader.  */
8051           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8052             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8053           /* For SLP reductions we vectorize the epilogue for
8054              all involved stmts together.  */
8055           else if (slp_index != 0)
8056             return true;
8057           else
8058             /* For SLP reductions the meta-info is attached to
8059                the representative.  */
8060             stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8061         }
8062       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8063       gcc_assert (reduc_info->is_reduc_info);
8064       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8065           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8066         return true;
8067       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8068                                         slp_node_instance);
8069       return true;
8070     }
8071
8072   /* FORNOW.  CHECKME.  */
8073   if (nested_in_vect_loop_p (loop, stmt_info))
8074     return false;
8075
8076   /* If STMT is not relevant and it is a simple assignment and its inputs are
8077      invariant then it can remain in place, unvectorized.  The original last
8078      scalar value that it computes will be used.  */
8079   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8080     {
8081       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8082       if (dump_enabled_p ())
8083         dump_printf_loc (MSG_NOTE, vect_location,
8084                          "statement is simple and uses invariant.  Leaving in "
8085                          "place.\n");
8086       return true;
8087     }
8088
8089   if (slp_node)
8090     ncopies = 1;
8091   else
8092     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8093
8094   if (slp_node)
8095     {
8096       gcc_assert (slp_index >= 0);
8097
8098       int num_scalar = SLP_TREE_LANES (slp_node);
8099       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8100
8101       /* Get the last occurrence of the scalar index from the concatenation of
8102          all the slp vectors. Calculate which slp vector it is and the index
8103          within.  */
8104       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8105
8106       /* Calculate which vector contains the result, and which lane of
8107          that vector we need.  */
8108       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8109         {
8110           if (dump_enabled_p ())
8111             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8112                              "Cannot determine which vector holds the"
8113                              " final result.\n");
8114           return false;
8115         }
8116     }
8117
8118   if (!vec_stmt_p)
8119     {
8120       /* No transformation required.  */
8121       if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8122         {
8123           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8124                                                OPTIMIZE_FOR_SPEED))
8125             {
8126               if (dump_enabled_p ())
8127                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8128                                  "can't operate on partial vectors "
8129                                  "because the target doesn't support extract "
8130                                  "last reduction.\n");
8131               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8132             }
8133           else if (slp_node)
8134             {
8135               if (dump_enabled_p ())
8136                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8137                                  "can't operate on partial vectors "
8138                                  "because an SLP statement is live after "
8139                                  "the loop.\n");
8140               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8141             }
8142           else if (ncopies > 1)
8143             {
8144               if (dump_enabled_p ())
8145                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8146                                  "can't operate on partial vectors "
8147                                  "because ncopies is greater than 1.\n");
8148               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8149             }
8150           else
8151             {
8152               gcc_assert (ncopies == 1 && !slp_node);
8153               vect_record_loop_mask (loop_vinfo,
8154                                      &LOOP_VINFO_MASKS (loop_vinfo),
8155                                      1, vectype, NULL);
8156             }
8157         }
8158       return true;
8159     }
8160
8161   /* Use the lhs of the original scalar statement.  */
8162   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8163
8164   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8165         : gimple_get_lhs (stmt);
8166   lhs_type = TREE_TYPE (lhs);
8167
8168   bitsize = vector_element_bits_tree (vectype);
8169   vec_bitsize = TYPE_SIZE (vectype);
8170
8171   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8172   tree vec_lhs, bitstart;
8173   if (slp_node)
8174     {
8175       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8176
8177       /* Get the correct slp vectorized stmt.  */
8178       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8179       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8180         vec_lhs = gimple_phi_result (phi);
8181       else
8182         vec_lhs = gimple_get_lhs (vec_stmt);
8183
8184       /* Get entry to use.  */
8185       bitstart = bitsize_int (vec_index);
8186       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8187     }
8188   else
8189     {
8190       /* For multiple copies, get the last copy.  */
8191       vec_lhs = gimple_get_lhs (STMT_VINFO_VEC_STMTS (stmt_info).last ());
8192
8193       /* Get the last lane in the vector.  */
8194       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8195     }
8196
8197   /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8198      requirement, insert one phi node for it.  It looks like:
8199          loop;
8200        BB:
8201          # lhs' = PHI <lhs>
8202      ==>
8203          loop;
8204        BB:
8205          # vec_lhs' = PHI <vec_lhs>
8206          new_tree = lane_extract <vec_lhs', ...>;
8207          lhs' = new_tree;  */
8208
8209   basic_block exit_bb = single_exit (loop)->dest;
8210   gcc_assert (single_pred_p (exit_bb));
8211
8212   tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8213   gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8214   SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8215
8216   gimple_seq stmts = NULL;
8217   tree new_tree;
8218   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8219     {
8220       /* Emit:
8221
8222            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8223
8224          where VEC_LHS is the vectorized live-out result and MASK is
8225          the loop mask for the final iteration.  */
8226       gcc_assert (ncopies == 1 && !slp_node);
8227       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8228       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
8229                                       vectype, 0);
8230       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8231                                       mask, vec_lhs_phi);
8232
8233       /* Convert the extracted vector element to the required scalar type.  */
8234       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8235     }
8236   else
8237     {
8238       tree bftype = TREE_TYPE (vectype);
8239       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8240         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8241       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
8242       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8243                                        &stmts, true, NULL_TREE);
8244     }
8245
8246   if (stmts)
8247     {
8248       gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8249       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8250
8251       /* Remove existing phi from lhs and create one copy from new_tree.  */
8252       tree lhs_phi = NULL_TREE;
8253       gimple_stmt_iterator gsi;
8254       for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8255         {
8256           gimple *phi = gsi_stmt (gsi);
8257           if ((gimple_phi_arg_def (phi, 0) == lhs))
8258             {
8259               remove_phi_node (&gsi, false);
8260               lhs_phi = gimple_phi_result (phi);
8261               gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8262               gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8263               break;
8264             }
8265         }
8266     }
8267
8268   /* Replace use of lhs with newly computed result.  If the use stmt is a
8269      single arg PHI, just replace all uses of PHI result.  It's necessary
8270      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8271   use_operand_p use_p;
8272   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8273     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8274         && !is_gimple_debug (use_stmt))
8275     {
8276       if (gimple_code (use_stmt) == GIMPLE_PHI
8277           && gimple_phi_num_args (use_stmt) == 1)
8278         {
8279           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8280         }
8281       else
8282         {
8283           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8284             SET_USE (use_p, new_tree);
8285         }
8286       update_stmt (use_stmt);
8287     }
8288
8289   return true;
8290 }
8291
8292 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
8293
8294 static void
8295 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8296 {
8297   ssa_op_iter op_iter;
8298   imm_use_iterator imm_iter;
8299   def_operand_p def_p;
8300   gimple *ustmt;
8301
8302   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8303     {
8304       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8305         {
8306           basic_block bb;
8307
8308           if (!is_gimple_debug (ustmt))
8309             continue;
8310
8311           bb = gimple_bb (ustmt);
8312
8313           if (!flow_bb_inside_loop_p (loop, bb))
8314             {
8315               if (gimple_debug_bind_p (ustmt))
8316                 {
8317                   if (dump_enabled_p ())
8318                     dump_printf_loc (MSG_NOTE, vect_location,
8319                                      "killing debug use\n");
8320
8321                   gimple_debug_bind_reset_value (ustmt);
8322                   update_stmt (ustmt);
8323                 }
8324               else
8325                 gcc_unreachable ();
8326             }
8327         }
8328     }
8329 }
8330
8331 /* Given loop represented by LOOP_VINFO, return true if computation of
8332    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8333    otherwise.  */
8334
8335 static bool
8336 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8337 {
8338   /* Constant case.  */
8339   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8340     {
8341       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8342       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8343
8344       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8345       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8346       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8347         return true;
8348     }
8349
8350   widest_int max;
8351   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8352   /* Check the upper bound of loop niters.  */
8353   if (get_max_loop_iterations (loop, &max))
8354     {
8355       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8356       signop sgn = TYPE_SIGN (type);
8357       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8358       if (max < type_max)
8359         return true;
8360     }
8361   return false;
8362 }
8363
8364 /* Return a mask type with half the number of elements as OLD_TYPE,
8365    given that it should have mode NEW_MODE.  */
8366
8367 tree
8368 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8369 {
8370   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8371   return build_truth_vector_type_for_mode (nunits, new_mode);
8372 }
8373
8374 /* Return a mask type with twice as many elements as OLD_TYPE,
8375    given that it should have mode NEW_MODE.  */
8376
8377 tree
8378 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8379 {
8380   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8381   return build_truth_vector_type_for_mode (nunits, new_mode);
8382 }
8383
8384 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8385    contain a sequence of NVECTORS masks that each control a vector of type
8386    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
8387    these vector masks with the vector version of SCALAR_MASK.  */
8388
8389 void
8390 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8391                        unsigned int nvectors, tree vectype, tree scalar_mask)
8392 {
8393   gcc_assert (nvectors != 0);
8394   if (masks->length () < nvectors)
8395     masks->safe_grow_cleared (nvectors);
8396   rgroup_controls *rgm = &(*masks)[nvectors - 1];
8397   /* The number of scalars per iteration and the number of vectors are
8398      both compile-time constants.  */
8399   unsigned int nscalars_per_iter
8400     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8401                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8402
8403   if (scalar_mask)
8404     {
8405       scalar_cond_masked_key cond (scalar_mask, nvectors);
8406       loop_vinfo->scalar_cond_masked_set.add (cond);
8407     }
8408
8409   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8410     {
8411       rgm->max_nscalars_per_iter = nscalars_per_iter;
8412       rgm->type = truth_type_for (vectype);
8413       rgm->factor = 1;
8414     }
8415 }
8416
8417 /* Given a complete set of masks MASKS, extract mask number INDEX
8418    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8419    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8420
8421    See the comment above vec_loop_masks for more details about the mask
8422    arrangement.  */
8423
8424 tree
8425 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8426                     unsigned int nvectors, tree vectype, unsigned int index)
8427 {
8428   rgroup_controls *rgm = &(*masks)[nvectors - 1];
8429   tree mask_type = rgm->type;
8430
8431   /* Populate the rgroup's mask array, if this is the first time we've
8432      used it.  */
8433   if (rgm->controls.is_empty ())
8434     {
8435       rgm->controls.safe_grow_cleared (nvectors);
8436       for (unsigned int i = 0; i < nvectors; ++i)
8437         {
8438           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8439           /* Provide a dummy definition until the real one is available.  */
8440           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8441           rgm->controls[i] = mask;
8442         }
8443     }
8444
8445   tree mask = rgm->controls[index];
8446   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8447                 TYPE_VECTOR_SUBPARTS (vectype)))
8448     {
8449       /* A loop mask for data type X can be reused for data type Y
8450          if X has N times more elements than Y and if Y's elements
8451          are N times bigger than X's.  In this case each sequence
8452          of N elements in the loop mask will be all-zero or all-one.
8453          We can then view-convert the mask so that each sequence of
8454          N elements is replaced by a single element.  */
8455       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8456                               TYPE_VECTOR_SUBPARTS (vectype)));
8457       gimple_seq seq = NULL;
8458       mask_type = truth_type_for (vectype);
8459       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8460       if (seq)
8461         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8462     }
8463   return mask;
8464 }
8465
8466 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8467    lengths for controlling an operation on VECTYPE.  The operation splits
8468    each element of VECTYPE into FACTOR separate subelements, measuring the
8469    length as a number of these subelements.  */
8470
8471 void
8472 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8473                       unsigned int nvectors, tree vectype, unsigned int factor)
8474 {
8475   gcc_assert (nvectors != 0);
8476   if (lens->length () < nvectors)
8477     lens->safe_grow_cleared (nvectors);
8478   rgroup_controls *rgl = &(*lens)[nvectors - 1];
8479
8480   /* The number of scalars per iteration, scalar occupied bytes and
8481      the number of vectors are both compile-time constants.  */
8482   unsigned int nscalars_per_iter
8483     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8484                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8485
8486   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8487     {
8488       /* For now, we only support cases in which all loads and stores fall back
8489          to VnQI or none do.  */
8490       gcc_assert (!rgl->max_nscalars_per_iter
8491                   || (rgl->factor == 1 && factor == 1)
8492                   || (rgl->max_nscalars_per_iter * rgl->factor
8493                       == nscalars_per_iter * factor));
8494       rgl->max_nscalars_per_iter = nscalars_per_iter;
8495       rgl->type = vectype;
8496       rgl->factor = factor;
8497     }
8498 }
8499
8500 /* Given a complete set of length LENS, extract length number INDEX for an
8501    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
8502
8503 tree
8504 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8505                    unsigned int nvectors, unsigned int index)
8506 {
8507   rgroup_controls *rgl = &(*lens)[nvectors - 1];
8508
8509   /* Populate the rgroup's len array, if this is the first time we've
8510      used it.  */
8511   if (rgl->controls.is_empty ())
8512     {
8513       rgl->controls.safe_grow_cleared (nvectors);
8514       for (unsigned int i = 0; i < nvectors; ++i)
8515         {
8516           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8517           gcc_assert (len_type != NULL_TREE);
8518           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
8519
8520           /* Provide a dummy definition until the real one is available.  */
8521           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
8522           rgl->controls[i] = len;
8523         }
8524     }
8525
8526   return rgl->controls[index];
8527 }
8528
8529 /* Scale profiling counters by estimation for LOOP which is vectorized
8530    by factor VF.  */
8531
8532 static void
8533 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8534 {
8535   edge preheader = loop_preheader_edge (loop);
8536   /* Reduce loop iterations by the vectorization factor.  */
8537   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8538   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8539
8540   if (freq_h.nonzero_p ())
8541     {
8542       profile_probability p;
8543
8544       /* Avoid dropping loop body profile counter to 0 because of zero count
8545          in loop's preheader.  */
8546       if (!(freq_e == profile_count::zero ()))
8547         freq_e = freq_e.force_nonzero ();
8548       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8549       scale_loop_frequencies (loop, p);
8550     }
8551
8552   edge exit_e = single_exit (loop);
8553   exit_e->probability = profile_probability::always ()
8554                                  .apply_scale (1, new_est_niter + 1);
8555
8556   edge exit_l = single_pred_edge (loop->latch);
8557   profile_probability prob = exit_l->probability;
8558   exit_l->probability = exit_e->probability.invert ();
8559   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8560     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8561 }
8562
8563 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8564    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8565    stmt_vec_info.  */
8566
8567 static void
8568 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8569                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8570 {
8571   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8572   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8573
8574   if (dump_enabled_p ())
8575     dump_printf_loc (MSG_NOTE, vect_location,
8576                      "------>vectorizing statement: %G", stmt_info->stmt);
8577
8578   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8579     vect_loop_kill_debug_uses (loop, stmt_info);
8580
8581   if (!STMT_VINFO_RELEVANT_P (stmt_info)
8582       && !STMT_VINFO_LIVE_P (stmt_info))
8583     return;
8584
8585   if (STMT_VINFO_VECTYPE (stmt_info))
8586     {
8587       poly_uint64 nunits
8588         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8589       if (!STMT_SLP_TYPE (stmt_info)
8590           && maybe_ne (nunits, vf)
8591           && dump_enabled_p ())
8592         /* For SLP VF is set according to unrolling factor, and not
8593            to vector size, hence for SLP this print is not valid.  */
8594         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8595     }
8596
8597   /* Pure SLP statements have already been vectorized.  We still need
8598      to apply loop vectorization to hybrid SLP statements.  */
8599   if (PURE_SLP_STMT (stmt_info))
8600     return;
8601
8602   if (dump_enabled_p ())
8603     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8604
8605   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8606     *seen_store = stmt_info;
8607 }
8608
8609 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
8610    in the hash_map with its corresponding values.  */
8611
8612 static tree
8613 find_in_mapping (tree t, void *context)
8614 {
8615   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8616
8617   tree *value = mapping->get (t);
8618   return value ? *value : t;
8619 }
8620
8621 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
8622    original loop that has now been vectorized.
8623
8624    The inits of the data_references need to be advanced with the number of
8625    iterations of the main loop.  This has been computed in vect_do_peeling and
8626    is stored in parameter ADVANCE.  We first restore the data_references
8627    initial offset with the values recored in ORIG_DRS_INIT.
8628
8629    Since the loop_vec_info of this EPILOGUE was constructed for the original
8630    loop, its stmt_vec_infos all point to the original statements.  These need
8631    to be updated to point to their corresponding copies as well as the SSA_NAMES
8632    in their PATTERN_DEF_SEQs and RELATED_STMTs.
8633
8634    The data_reference's connections also need to be updated.  Their
8635    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
8636    stmt_vec_infos, their statements need to point to their corresponding copy,
8637    if they are gather loads or scatter stores then their reference needs to be
8638    updated to point to its corresponding copy and finally we set
8639    'base_misaligned' to false as we have already peeled for alignment in the
8640    prologue of the main loop.  */
8641
8642 static void
8643 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8644 {
8645   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8646   auto_vec<gimple *> stmt_worklist;
8647   hash_map<tree,tree> mapping;
8648   gimple *orig_stmt, *new_stmt;
8649   gimple_stmt_iterator epilogue_gsi;
8650   gphi_iterator epilogue_phi_gsi;
8651   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8652   basic_block *epilogue_bbs = get_loop_body (epilogue);
8653   unsigned i;
8654
8655   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8656
8657   /* Advance data_reference's with the number of iterations of the previous
8658      loop and its prologue.  */
8659   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8660
8661
8662   /* The EPILOGUE loop is a copy of the original loop so they share the same
8663      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
8664      point to the copied statements.  We also create a mapping of all LHS' in
8665      the original loop and all the LHS' in the EPILOGUE and create worklists to
8666      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
8667   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8668     {
8669       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8670            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8671         {
8672           new_stmt = epilogue_phi_gsi.phi ();
8673
8674           gcc_assert (gimple_uid (new_stmt) > 0);
8675           stmt_vinfo
8676             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8677
8678           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8679           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8680
8681           mapping.put (gimple_phi_result (orig_stmt),
8682                        gimple_phi_result (new_stmt));
8683           /* PHI nodes can not have patterns or related statements.  */
8684           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8685                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8686         }
8687
8688       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8689            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8690         {
8691           new_stmt = gsi_stmt (epilogue_gsi);
8692           if (is_gimple_debug (new_stmt))
8693             continue;
8694
8695           gcc_assert (gimple_uid (new_stmt) > 0);
8696           stmt_vinfo
8697             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8698
8699           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8700           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8701
8702           if (tree old_lhs = gimple_get_lhs (orig_stmt))
8703             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8704
8705           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8706             {
8707               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8708               for (gimple_stmt_iterator gsi = gsi_start (seq);
8709                    !gsi_end_p (gsi); gsi_next (&gsi))
8710                 stmt_worklist.safe_push (gsi_stmt (gsi));
8711             }
8712
8713           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8714           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8715             {
8716               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8717               stmt_worklist.safe_push (stmt);
8718               /* Set BB such that the assert in
8719                 'get_initial_def_for_reduction' is able to determine that
8720                 the BB of the related stmt is inside this loop.  */
8721               gimple_set_bb (stmt,
8722                              gimple_bb (new_stmt));
8723               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8724               gcc_assert (related_vinfo == NULL
8725                           || related_vinfo == stmt_vinfo);
8726             }
8727         }
8728     }
8729
8730   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8731      using the original main loop and thus need to be updated to refer to the
8732      cloned variables used in the epilogue.  */
8733   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8734     {
8735       gimple *stmt = stmt_worklist[i];
8736       tree *new_op;
8737
8738       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8739         {
8740           tree op = gimple_op (stmt, j);
8741           if ((new_op = mapping.get(op)))
8742             gimple_set_op (stmt, j, *new_op);
8743           else
8744             {
8745               /* PR92429: The last argument of simplify_replace_tree disables
8746                  folding when replacing arguments.  This is required as
8747                  otherwise you might end up with different statements than the
8748                  ones analyzed in vect_loop_analyze, leading to different
8749                  vectorization.  */
8750               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8751                                           &find_in_mapping, &mapping, false);
8752               gimple_set_op (stmt, j, op);
8753             }
8754         }
8755     }
8756
8757   struct data_reference *dr;
8758   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
8759   FOR_EACH_VEC_ELT (datarefs, i, dr)
8760     {
8761       orig_stmt = DR_STMT (dr);
8762       gcc_assert (gimple_uid (orig_stmt) > 0);
8763       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8764       /* Data references for gather loads and scatter stores do not use the
8765          updated offset we set using ADVANCE.  Instead we have to make sure the
8766          reference in the data references point to the corresponding copy of
8767          the original in the epilogue.  */
8768       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8769           == VMAT_GATHER_SCATTER)
8770         {
8771           DR_REF (dr)
8772             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8773                                      &find_in_mapping, &mapping);
8774           DR_BASE_ADDRESS (dr)
8775             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8776                                      &find_in_mapping, &mapping);
8777         }
8778       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8779       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8780       /* The vector size of the epilogue is smaller than that of the main loop
8781          so the alignment is either the same or lower. This means the dr will
8782          thus by definition be aligned.  */
8783       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8784     }
8785
8786   epilogue_vinfo->shared->datarefs_copy.release ();
8787   epilogue_vinfo->shared->save_datarefs ();
8788 }
8789
8790 /* Function vect_transform_loop.
8791
8792    The analysis phase has determined that the loop is vectorizable.
8793    Vectorize the loop - created vectorized stmts to replace the scalar
8794    stmts in the loop, and update the loop exit condition.
8795    Returns scalar epilogue loop if any.  */
8796
8797 class loop *
8798 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8799 {
8800   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8801   class loop *epilogue = NULL;
8802   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8803   int nbbs = loop->num_nodes;
8804   int i;
8805   tree niters_vector = NULL_TREE;
8806   tree step_vector = NULL_TREE;
8807   tree niters_vector_mult_vf = NULL_TREE;
8808   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8809   unsigned int lowest_vf = constant_lower_bound (vf);
8810   gimple *stmt;
8811   bool check_profitability = false;
8812   unsigned int th;
8813
8814   DUMP_VECT_SCOPE ("vec_transform_loop");
8815
8816   loop_vinfo->shared->check_datarefs ();
8817
8818   /* Use the more conservative vectorization threshold.  If the number
8819      of iterations is constant assume the cost check has been performed
8820      by our caller.  If the threshold makes all loops profitable that
8821      run at least the (estimated) vectorization factor number of times
8822      checking is pointless, too.  */
8823   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8824   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8825     {
8826       if (dump_enabled_p ())
8827         dump_printf_loc (MSG_NOTE, vect_location,
8828                          "Profitability threshold is %d loop iterations.\n",
8829                          th);
8830       check_profitability = true;
8831     }
8832
8833   /* Make sure there exists a single-predecessor exit bb.  Do this before
8834      versioning.   */
8835   edge e = single_exit (loop);
8836   if (! single_pred_p (e->dest))
8837     {
8838       split_loop_exit_edge (e, true);
8839       if (dump_enabled_p ())
8840         dump_printf (MSG_NOTE, "split exit edge\n");
8841     }
8842
8843   /* Version the loop first, if required, so the profitability check
8844      comes first.  */
8845
8846   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8847     {
8848       class loop *sloop
8849         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8850       sloop->force_vectorize = false;
8851       check_profitability = false;
8852     }
8853
8854   /* Make sure there exists a single-predecessor exit bb also on the
8855      scalar loop copy.  Do this after versioning but before peeling
8856      so CFG structure is fine for both scalar and if-converted loop
8857      to make slpeel_duplicate_current_defs_from_edges face matched
8858      loop closed PHI nodes on the exit.  */
8859   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8860     {
8861       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8862       if (! single_pred_p (e->dest))
8863         {
8864           split_loop_exit_edge (e, true);
8865           if (dump_enabled_p ())
8866             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8867         }
8868     }
8869
8870   tree niters = vect_build_loop_niters (loop_vinfo);
8871   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8872   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8873   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8874   tree advance;
8875   drs_init_vec orig_drs_init;
8876
8877   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8878                               &step_vector, &niters_vector_mult_vf, th,
8879                               check_profitability, niters_no_overflow,
8880                               &advance);
8881
8882   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8883       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8884     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8885                             LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8886
8887   if (niters_vector == NULL_TREE)
8888     {
8889       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8890           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
8891           && known_eq (lowest_vf, vf))
8892         {
8893           niters_vector
8894             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8895                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8896           step_vector = build_one_cst (TREE_TYPE (niters));
8897         }
8898       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
8899         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8900                                      &step_vector, niters_no_overflow);
8901       else
8902         /* vect_do_peeling subtracted the number of peeled prologue
8903            iterations from LOOP_VINFO_NITERS.  */
8904         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
8905                                      &niters_vector, &step_vector,
8906                                      niters_no_overflow);
8907     }
8908
8909   /* 1) Make sure the loop header has exactly two entries
8910      2) Make sure we have a preheader basic block.  */
8911
8912   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8913
8914   split_edge (loop_preheader_edge (loop));
8915
8916   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8917       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8918     /* This will deal with any possible peeling.  */
8919     vect_prepare_for_masked_peels (loop_vinfo);
8920
8921   /* Schedule the SLP instances first, then handle loop vectorization
8922      below.  */
8923   if (!loop_vinfo->slp_instances.is_empty ())
8924     {
8925       DUMP_VECT_SCOPE ("scheduling SLP instances");
8926       vect_schedule_slp (loop_vinfo);
8927     }
8928
8929   /* FORNOW: the vectorizer supports only loops which body consist
8930      of one basic block (header + empty latch). When the vectorizer will
8931      support more involved loop forms, the order by which the BBs are
8932      traversed need to be reconsidered.  */
8933
8934   for (i = 0; i < nbbs; i++)
8935     {
8936       basic_block bb = bbs[i];
8937       stmt_vec_info stmt_info;
8938
8939       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8940            gsi_next (&si))
8941         {
8942           gphi *phi = si.phi ();
8943           if (dump_enabled_p ())
8944             dump_printf_loc (MSG_NOTE, vect_location,
8945                              "------>vectorizing phi: %G", phi);
8946           stmt_info = loop_vinfo->lookup_stmt (phi);
8947           if (!stmt_info)
8948             continue;
8949
8950           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8951             vect_loop_kill_debug_uses (loop, stmt_info);
8952
8953           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8954               && !STMT_VINFO_LIVE_P (stmt_info))
8955             continue;
8956
8957           if (STMT_VINFO_VECTYPE (stmt_info)
8958               && (maybe_ne
8959                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8960               && dump_enabled_p ())
8961             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8962
8963           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8964                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8965                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8966                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8967                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8968               && ! PURE_SLP_STMT (stmt_info))
8969             {
8970               if (dump_enabled_p ())
8971                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8972               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
8973             }
8974         }
8975
8976       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8977            !gsi_end_p (si);)
8978         {
8979           stmt = gsi_stmt (si);
8980           /* During vectorization remove existing clobber stmts.  */
8981           if (gimple_clobber_p (stmt))
8982             {
8983               unlink_stmt_vdef (stmt);
8984               gsi_remove (&si, true);
8985               release_defs (stmt);
8986             }
8987           else
8988             {
8989               /* Ignore vector stmts created in the outer loop.  */
8990               stmt_info = loop_vinfo->lookup_stmt (stmt);
8991
8992               /* vector stmts created in the outer-loop during vectorization of
8993                  stmts in an inner-loop may not have a stmt_info, and do not
8994                  need to be vectorized.  */
8995               stmt_vec_info seen_store = NULL;
8996               if (stmt_info)
8997                 {
8998                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8999                     {
9000                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9001                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9002                            !gsi_end_p (subsi); gsi_next (&subsi))
9003                         {
9004                           stmt_vec_info pat_stmt_info
9005                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9006                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9007                                                     &si, &seen_store);
9008                         }
9009                       stmt_vec_info pat_stmt_info
9010                         = STMT_VINFO_RELATED_STMT (stmt_info);
9011                       vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
9012                                                 &seen_store);
9013                     }
9014                   vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9015                                             &seen_store);
9016                 }
9017               gsi_next (&si);
9018               if (seen_store)
9019                 {
9020                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9021                     /* Interleaving.  If IS_STORE is TRUE, the
9022                        vectorization of the interleaving chain was
9023                        completed - free all the stores in the chain.  */
9024                     vect_remove_stores (loop_vinfo,
9025                                         DR_GROUP_FIRST_ELEMENT (seen_store));
9026                   else
9027                     /* Free the attached stmt_vec_info and remove the stmt.  */
9028                     loop_vinfo->remove_stmt (stmt_info);
9029                 }
9030             }
9031         }
9032
9033       /* Stub out scalar statements that must not survive vectorization.
9034          Doing this here helps with grouped statements, or statements that
9035          are involved in patterns.  */
9036       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9037            !gsi_end_p (gsi); gsi_next (&gsi))
9038         {
9039           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9040           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9041             {
9042               tree lhs = gimple_get_lhs (call);
9043               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9044                 {
9045                   tree zero = build_zero_cst (TREE_TYPE (lhs));
9046                   gimple *new_stmt = gimple_build_assign (lhs, zero);
9047                   gsi_replace (&gsi, new_stmt, true);
9048                 }
9049             }
9050         }
9051     }                           /* BBs in loop */
9052
9053   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
9054      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
9055   if (integer_onep (step_vector))
9056     niters_no_overflow = true;
9057   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9058                            niters_vector_mult_vf, !niters_no_overflow);
9059
9060   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9061   scale_profile_for_vect_loop (loop, assumed_vf);
9062
9063   /* True if the final iteration might not handle a full vector's
9064      worth of scalar iterations.  */
9065   bool final_iter_may_be_partial
9066     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9067   /* The minimum number of iterations performed by the epilogue.  This
9068      is 1 when peeling for gaps because we always need a final scalar
9069      iteration.  */
9070   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9071   /* +1 to convert latch counts to loop iteration counts,
9072      -min_epilogue_iters to remove iterations that cannot be performed
9073        by the vector code.  */
9074   int bias_for_lowest = 1 - min_epilogue_iters;
9075   int bias_for_assumed = bias_for_lowest;
9076   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9077   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9078     {
9079       /* When the amount of peeling is known at compile time, the first
9080          iteration will have exactly alignment_npeels active elements.
9081          In the worst case it will have at least one.  */
9082       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9083       bias_for_lowest += lowest_vf - min_first_active;
9084       bias_for_assumed += assumed_vf - min_first_active;
9085     }
9086   /* In these calculations the "- 1" converts loop iteration counts
9087      back to latch counts.  */
9088   if (loop->any_upper_bound)
9089     loop->nb_iterations_upper_bound
9090       = (final_iter_may_be_partial
9091          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9092                           lowest_vf) - 1
9093          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9094                            lowest_vf) - 1);
9095   if (loop->any_likely_upper_bound)
9096     loop->nb_iterations_likely_upper_bound
9097       = (final_iter_may_be_partial
9098          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9099                           + bias_for_lowest, lowest_vf) - 1
9100          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9101                            + bias_for_lowest, lowest_vf) - 1);
9102   if (loop->any_estimate)
9103     loop->nb_iterations_estimate
9104       = (final_iter_may_be_partial
9105          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9106                           assumed_vf) - 1
9107          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9108                            assumed_vf) - 1);
9109
9110   if (dump_enabled_p ())
9111     {
9112       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9113         {
9114           dump_printf_loc (MSG_NOTE, vect_location,
9115                            "LOOP VECTORIZED\n");
9116           if (loop->inner)
9117             dump_printf_loc (MSG_NOTE, vect_location,
9118                              "OUTER LOOP VECTORIZED\n");
9119           dump_printf (MSG_NOTE, "\n");
9120         }
9121       else
9122         dump_printf_loc (MSG_NOTE, vect_location,
9123                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9124                          GET_MODE_NAME (loop_vinfo->vector_mode));
9125     }
9126
9127   /* Loops vectorized with a variable factor won't benefit from
9128      unrolling/peeling.  */
9129   if (!vf.is_constant ())
9130     {
9131       loop->unroll = 1;
9132       if (dump_enabled_p ())
9133         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9134                          " variable-length vectorization factor\n");
9135     }
9136   /* Free SLP instances here because otherwise stmt reference counting
9137      won't work.  */
9138   slp_instance instance;
9139   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9140     vect_free_slp_instance (instance, true);
9141   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9142   /* Clear-up safelen field since its value is invalid after vectorization
9143      since vectorized loop can have loop-carried dependencies.  */
9144   loop->safelen = 0;
9145
9146   if (epilogue)
9147     {
9148       update_epilogue_loop_vinfo (epilogue, advance);
9149
9150       epilogue->simduid = loop->simduid;
9151       epilogue->force_vectorize = loop->force_vectorize;
9152       epilogue->dont_vectorize = false;
9153     }
9154
9155   return epilogue;
9156 }
9157
9158 /* The code below is trying to perform simple optimization - revert
9159    if-conversion for masked stores, i.e. if the mask of a store is zero
9160    do not perform it and all stored value producers also if possible.
9161    For example,
9162      for (i=0; i<n; i++)
9163        if (c[i])
9164         {
9165           p1[i] += 1;
9166           p2[i] = p3[i] +2;
9167         }
9168    this transformation will produce the following semi-hammock:
9169
9170    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9171      {
9172        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9173        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9174        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9175        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9176        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9177        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9178      }
9179 */
9180
9181 void
9182 optimize_mask_stores (class loop *loop)
9183 {
9184   basic_block *bbs = get_loop_body (loop);
9185   unsigned nbbs = loop->num_nodes;
9186   unsigned i;
9187   basic_block bb;
9188   class loop *bb_loop;
9189   gimple_stmt_iterator gsi;
9190   gimple *stmt;
9191   auto_vec<gimple *> worklist;
9192   auto_purge_vect_location sentinel;
9193
9194   vect_location = find_loop_location (loop);
9195   /* Pick up all masked stores in loop if any.  */
9196   for (i = 0; i < nbbs; i++)
9197     {
9198       bb = bbs[i];
9199       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9200            gsi_next (&gsi))
9201         {
9202           stmt = gsi_stmt (gsi);
9203           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9204             worklist.safe_push (stmt);
9205         }
9206     }
9207
9208   free (bbs);
9209   if (worklist.is_empty ())
9210     return;
9211
9212   /* Loop has masked stores.  */
9213   while (!worklist.is_empty ())
9214     {
9215       gimple *last, *last_store;
9216       edge e, efalse;
9217       tree mask;
9218       basic_block store_bb, join_bb;
9219       gimple_stmt_iterator gsi_to;
9220       tree vdef, new_vdef;
9221       gphi *phi;
9222       tree vectype;
9223       tree zero;
9224
9225       last = worklist.pop ();
9226       mask = gimple_call_arg (last, 2);
9227       bb = gimple_bb (last);
9228       /* Create then_bb and if-then structure in CFG, then_bb belongs to
9229          the same loop as if_bb.  It could be different to LOOP when two
9230          level loop-nest is vectorized and mask_store belongs to the inner
9231          one.  */
9232       e = split_block (bb, last);
9233       bb_loop = bb->loop_father;
9234       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9235       join_bb = e->dest;
9236       store_bb = create_empty_bb (bb);
9237       add_bb_to_loop (store_bb, bb_loop);
9238       e->flags = EDGE_TRUE_VALUE;
9239       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9240       /* Put STORE_BB to likely part.  */
9241       efalse->probability = profile_probability::unlikely ();
9242       store_bb->count = efalse->count ();
9243       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9244       if (dom_info_available_p (CDI_DOMINATORS))
9245         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9246       if (dump_enabled_p ())
9247         dump_printf_loc (MSG_NOTE, vect_location,
9248                          "Create new block %d to sink mask stores.",
9249                          store_bb->index);
9250       /* Create vector comparison with boolean result.  */
9251       vectype = TREE_TYPE (mask);
9252       zero = build_zero_cst (vectype);
9253       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9254       gsi = gsi_last_bb (bb);
9255       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9256       /* Create new PHI node for vdef of the last masked store:
9257          .MEM_2 = VDEF <.MEM_1>
9258          will be converted to
9259          .MEM.3 = VDEF <.MEM_1>
9260          and new PHI node will be created in join bb
9261          .MEM_2 = PHI <.MEM_1, .MEM_3>
9262       */
9263       vdef = gimple_vdef (last);
9264       new_vdef = make_ssa_name (gimple_vop (cfun), last);
9265       gimple_set_vdef (last, new_vdef);
9266       phi = create_phi_node (vdef, join_bb);
9267       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9268
9269       /* Put all masked stores with the same mask to STORE_BB if possible.  */
9270       while (true)
9271         {
9272           gimple_stmt_iterator gsi_from;
9273           gimple *stmt1 = NULL;
9274
9275           /* Move masked store to STORE_BB.  */
9276           last_store = last;
9277           gsi = gsi_for_stmt (last);
9278           gsi_from = gsi;
9279           /* Shift GSI to the previous stmt for further traversal.  */
9280           gsi_prev (&gsi);
9281           gsi_to = gsi_start_bb (store_bb);
9282           gsi_move_before (&gsi_from, &gsi_to);
9283           /* Setup GSI_TO to the non-empty block start.  */
9284           gsi_to = gsi_start_bb (store_bb);
9285           if (dump_enabled_p ())
9286             dump_printf_loc (MSG_NOTE, vect_location,
9287                              "Move stmt to created bb\n%G", last);
9288           /* Move all stored value producers if possible.  */
9289           while (!gsi_end_p (gsi))
9290             {
9291               tree lhs;
9292               imm_use_iterator imm_iter;
9293               use_operand_p use_p;
9294               bool res;
9295
9296               /* Skip debug statements.  */
9297               if (is_gimple_debug (gsi_stmt (gsi)))
9298                 {
9299                   gsi_prev (&gsi);
9300                   continue;
9301                 }
9302               stmt1 = gsi_stmt (gsi);
9303               /* Do not consider statements writing to memory or having
9304                  volatile operand.  */
9305               if (gimple_vdef (stmt1)
9306                   || gimple_has_volatile_ops (stmt1))
9307                 break;
9308               gsi_from = gsi;
9309               gsi_prev (&gsi);
9310               lhs = gimple_get_lhs (stmt1);
9311               if (!lhs)
9312                 break;
9313
9314               /* LHS of vectorized stmt must be SSA_NAME.  */
9315               if (TREE_CODE (lhs) != SSA_NAME)
9316                 break;
9317
9318               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9319                 {
9320                   /* Remove dead scalar statement.  */
9321                   if (has_zero_uses (lhs))
9322                     {
9323                       gsi_remove (&gsi_from, true);
9324                       continue;
9325                     }
9326                 }
9327
9328               /* Check that LHS does not have uses outside of STORE_BB.  */
9329               res = true;
9330               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9331                 {
9332                   gimple *use_stmt;
9333                   use_stmt = USE_STMT (use_p);
9334                   if (is_gimple_debug (use_stmt))
9335                     continue;
9336                   if (gimple_bb (use_stmt) != store_bb)
9337                     {
9338                       res = false;
9339                       break;
9340                     }
9341                 }
9342               if (!res)
9343                 break;
9344
9345               if (gimple_vuse (stmt1)
9346                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
9347                 break;
9348
9349               /* Can move STMT1 to STORE_BB.  */
9350               if (dump_enabled_p ())
9351                 dump_printf_loc (MSG_NOTE, vect_location,
9352                                  "Move stmt to created bb\n%G", stmt1);
9353               gsi_move_before (&gsi_from, &gsi_to);
9354               /* Shift GSI_TO for further insertion.  */
9355               gsi_prev (&gsi_to);
9356             }
9357           /* Put other masked stores with the same mask to STORE_BB.  */
9358           if (worklist.is_empty ()
9359               || gimple_call_arg (worklist.last (), 2) != mask
9360               || worklist.last () != stmt1)
9361             break;
9362           last = worklist.pop ();
9363         }
9364       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9365     }
9366 }
9367
9368 /* Decide whether it is possible to use a zero-based induction variable
9369    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
9370    the value that the induction variable must be able to hold in order
9371    to ensure that the rgroups eventually have no active vector elements.
9372    Return -1 otherwise.  */
9373
9374 widest_int
9375 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9376 {
9377   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9378   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9379   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9380
9381   /* Calculate the value that the induction variable must be able
9382      to hit in order to ensure that we end the loop with an all-false mask.
9383      This involves adding the maximum number of inactive trailing scalar
9384      iterations.  */
9385   widest_int iv_limit = -1;
9386   if (max_loop_iterations (loop, &iv_limit))
9387     {
9388       if (niters_skip)
9389         {
9390           /* Add the maximum number of skipped iterations to the
9391              maximum iteration count.  */
9392           if (TREE_CODE (niters_skip) == INTEGER_CST)
9393             iv_limit += wi::to_widest (niters_skip);
9394           else
9395             iv_limit += max_vf - 1;
9396         }
9397       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9398         /* Make a conservatively-correct assumption.  */
9399         iv_limit += max_vf - 1;
9400
9401       /* IV_LIMIT is the maximum number of latch iterations, which is also
9402          the maximum in-range IV value.  Round this value down to the previous
9403          vector alignment boundary and then add an extra full iteration.  */
9404       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9405       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9406     }
9407   return iv_limit;
9408 }
9409
9410 /* For the given rgroup_controls RGC, check whether an induction variable
9411    would ever hit a value that produces a set of all-false masks or zero
9412    lengths before wrapping around.  Return true if it's possible to wrap
9413    around before hitting the desirable value, otherwise return false.  */
9414
9415 bool
9416 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9417 {
9418   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9419
9420   if (iv_limit == -1)
9421     return true;
9422
9423   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9424   unsigned int compare_precision = TYPE_PRECISION (compare_type);
9425   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
9426
9427   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9428     return true;
9429
9430   return false;
9431 }