gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
   }
69 as if it had been manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
   }
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
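/* Illustrative sketch only (not part of the pass), showing the style of
   optab query described above.  V8HImode is simply the mode from the
   example; a real check uses the mode of the vectype chosen for the
   statement being analyzed.  */

static inline bool
example_v8hi_add_supported_p (void)
{
  return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
}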
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
186 if (stmt_vectype)
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case in which a vectype has already been set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return opt_result::success ();
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
250 return opt_result::success ();
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4-byte elements,
258 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
    }
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
    }  */
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 for (i = 0; i < nbbs; i++)
295 basic_block bb = bbs[i];
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
306 gcc_assert (stmt_info);
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
338 vect_update_max_nunits (&vectorization_factor, vectype);
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
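/* Conceptual sketch of how the vectorization factor is accumulated above,
   using plain integers instead of poly_uint64 (an assumption for clarity):
   vect_update_max_nunits keeps a common multiple of all the lane counts,
   which for the usual power-of-two counts is simply the largest one.  */

static unsigned
example_accumulate_vf (const unsigned *nunits, unsigned n)
{
  unsigned vf = 1;
  for (unsigned i = 0; i < n; i++)
    {
      unsigned a = vf, b = nunits[i];
      while (b)			/* Euclid: a becomes gcd (vf, nunits[i]).  */
	{
	  unsigned t = a % b;
	  a = b;
	  b = t;
	}
      vf = vf / a * nunits[i];	/* lcm (vf, nunits[i]).  */
    }
  return vf;
}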
372 /* Function vect_is_simple_iv_evolution.
374 FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
403 *init = init_expr;
404 *step = step_expr;
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
422 return true;
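/* Illustrative source-level example (not used by the pass) of an IV that
   the test above accepts: the scalar evolution of I is the affine chrec
   {INIT, +, 4}, i.e. the step is a loop-invariant INTEGER_CST.  */

static int
example_simple_iv (int init, int n)
{
  int sum = 0;
  for (int i = init; i < n; i += 4)
    sum += i;
  return sum;
}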
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
436 x_3 = ...;
439 outer2:
440 x_4 = PHI <x_3(inner)>;
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
458 /* Function vect_analyze_scalar_cycles_1.
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified, therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
516 worklist.safe_push (stmt_vinfo);
517 continue;
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
559 else
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
569 else
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as a reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
594 /* Function vect_analyze_scalar_cycles.
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also to its
600 inner-loop, if it exists.
601 Examples for scalar cycles:
603 Example1: reduction:
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
609 Example2: induction:
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
657 while (stmt_info);
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
665 stmt_vec_info first;
666 unsigned i;
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669 if (STMT_VINFO_IN_PATTERN_P (first))
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
674 if (! STMT_VINFO_IN_PATTERN_P (next)
675 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
679 /* If not all stmts in the chain are patterns, or if we failed
680 to update STMT_VINFO_REDUC_IDX, try to handle the chain
681 without patterns. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
685 vect_fixup_reduc_chain (first);
686 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
687 = STMT_VINFO_RELATED_STMT (first);
692 /* Function vect_get_loop_niters.
694 Determine how many iterations the loop executes and place it
695 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
696 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
697 niter information holds in ASSUMPTIONS.
699 Return the loop exit condition. */
702 static gcond *
703 vect_get_loop_niters (class loop *loop, tree *assumptions,
704 tree *number_of_iterations, tree *number_of_iterationsm1)
706 edge exit = single_exit (loop);
707 class tree_niter_desc niter_desc;
708 tree niter_assumptions, niter, may_be_zero;
709 gcond *cond = get_loop_exit_condition (loop);
711 *assumptions = boolean_true_node;
712 *number_of_iterationsm1 = chrec_dont_know;
713 *number_of_iterations = chrec_dont_know;
714 DUMP_VECT_SCOPE ("get_loop_niters");
716 if (!exit)
717 return cond;
719 may_be_zero = NULL_TREE;
720 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
721 || chrec_contains_undetermined (niter_desc.niter))
722 return cond;
724 niter_assumptions = niter_desc.assumptions;
725 may_be_zero = niter_desc.may_be_zero;
726 niter = niter_desc.niter;
728 if (may_be_zero && integer_zerop (may_be_zero))
729 may_be_zero = NULL_TREE;
731 if (may_be_zero)
733 if (COMPARISON_CLASS_P (may_be_zero))
735 /* Try to combine may_be_zero with assumptions, this can simplify
736 computation of niter expression. */
737 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
738 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
739 niter_assumptions,
740 fold_build1 (TRUTH_NOT_EXPR,
741 boolean_type_node,
742 may_be_zero));
743 else
744 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
745 build_int_cst (TREE_TYPE (niter), 0),
746 rewrite_to_non_trapping_overflow (niter));
748 may_be_zero = NULL_TREE;
750 else if (integer_nonzerop (may_be_zero))
752 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
753 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
754 return cond;
756 else
757 return cond;
760 *assumptions = niter_assumptions;
761 *number_of_iterationsm1 = niter;
763 /* We want the number of loop header executions which is the number
764 of latch executions plus one.
765 ??? For UINT_MAX latch executions this number overflows to zero
766 for loops like do { n++; } while (n != 0); */
767 if (niter && !chrec_contains_undetermined (niter))
768 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
769 build_int_cst (TREE_TYPE (niter), 1));
770 *number_of_iterations = niter;
772 return cond;
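/* Illustrative only: the overflow caveat noted above.  With UINT_MAX
   latch executions, "latch executions + 1" wraps to zero in a 32-bit
   counter type, so the header-execution count cannot be represented.  */

static unsigned
example_niter_wraparound (void)
{
  unsigned latch_executions = UINT_MAX;
  return latch_executions + 1;	/* Yields 0.  */
}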
775 /* Function bb_in_loop_p
777 Used as predicate for dfs order traversal of the loop bbs. */
779 static bool
780 bb_in_loop_p (const_basic_block bb, const void *data)
782 const class loop *const loop = (const class loop *)data;
783 if (flow_bb_inside_loop_p (loop, bb))
784 return true;
785 return false;
789 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
790 stmt_vec_info structs for all the stmts in LOOP_IN. */
792 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
793 : vec_info (vec_info::loop, init_cost (loop_in), shared),
794 loop (loop_in),
795 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
796 num_itersm1 (NULL_TREE),
797 num_iters (NULL_TREE),
798 num_iters_unchanged (NULL_TREE),
799 num_iters_assumptions (NULL_TREE),
800 th (0),
801 versioning_threshold (0),
802 vectorization_factor (0),
803 max_vectorization_factor (0),
804 mask_skip_niters (NULL_TREE),
805 rgroup_compare_type (NULL_TREE),
806 simd_if_cond (NULL_TREE),
807 unaligned_dr (NULL),
808 peeling_for_alignment (0),
809 ptr_mask (0),
810 ivexpr_map (NULL),
811 scan_map (NULL),
812 slp_unrolling_factor (1),
813 single_scalar_iteration_cost (0),
814 vec_outside_cost (0),
815 vec_inside_cost (0),
816 vectorizable (false),
817 can_use_partial_vectors_p (true),
818 using_partial_vectors_p (false),
819 epil_using_partial_vectors_p (false),
820 peeling_for_gaps (false),
821 peeling_for_niter (false),
822 no_data_dependencies (false),
823 has_mask_store (false),
824 scalar_loop_scaling (profile_probability::uninitialized ()),
825 scalar_loop (NULL),
826 orig_loop_info (NULL)
828 /* CHECKME: We want to visit all BBs before their successors (except for
829 latch blocks, for which this assertion wouldn't hold). In the simple
830 case of the loop forms we allow, a dfs order of the BBs would be the same
831 as reversed postorder traversal, so we are safe. */
833 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
834 bbs, loop->num_nodes, loop);
835 gcc_assert (nbbs == loop->num_nodes);
837 for (unsigned int i = 0; i < nbbs; i++)
839 basic_block bb = bbs[i];
840 gimple_stmt_iterator si;
842 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
844 gimple *phi = gsi_stmt (si);
845 gimple_set_uid (phi, 0);
846 add_stmt (phi);
849 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
851 gimple *stmt = gsi_stmt (si);
852 gimple_set_uid (stmt, 0);
853 if (is_gimple_debug (stmt))
854 continue;
855 add_stmt (stmt);
856 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
857 third argument is the #pragma omp simd if (x) condition: when it is 0,
858 the loop shouldn't be vectorized; when it is a non-zero constant, it
859 should be vectorized normally; otherwise the loop is versioned, with the
860 vectorized copy taken if the condition is non-zero at runtime. */
861 if (loop_in->simduid
862 && is_gimple_call (stmt)
863 && gimple_call_internal_p (stmt)
864 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
865 && gimple_call_num_args (stmt) >= 3
866 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
867 && (loop_in->simduid
868 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
870 tree arg = gimple_call_arg (stmt, 2);
871 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
872 simd_if_cond = arg;
873 else
874 gcc_assert (integer_nonzerop (arg));
879 epilogue_vinfos.create (6);
882 /* Free all levels of rgroup CONTROLS. */
884 void
885 release_vec_loop_controls (vec<rgroup_controls> *controls)
887 rgroup_controls *rgc;
888 unsigned int i;
889 FOR_EACH_VEC_ELT (*controls, i, rgc)
890 rgc->controls.release ();
891 controls->release ();
894 /* Free all memory used by the _loop_vec_info, as well as all the
895 stmt_vec_info structs of all the stmts in the loop. */
897 _loop_vec_info::~_loop_vec_info ()
899 free (bbs);
901 release_vec_loop_controls (&masks);
902 release_vec_loop_controls (&lens);
903 delete ivexpr_map;
904 delete scan_map;
905 epilogue_vinfos.release ();
907 loop->aux = NULL;
910 /* Return an invariant or register for EXPR and emit necessary
911 computations in the LOOP_VINFO loop preheader. */
913 tree
914 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
916 if (is_gimple_reg (expr)
917 || is_gimple_min_invariant (expr))
918 return expr;
920 if (! loop_vinfo->ivexpr_map)
921 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
922 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
923 if (! cached)
925 gimple_seq stmts = NULL;
926 cached = force_gimple_operand (unshare_expr (expr),
927 &stmts, true, NULL_TREE);
928 if (stmts)
930 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
931 gsi_insert_seq_on_edge_immediate (e, stmts);
934 return cached;
937 /* Return true if we can use CMP_TYPE as the comparison type to produce
938 all masks required to mask LOOP_VINFO. */
940 static bool
941 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
943 rgroup_controls *rgm;
944 unsigned int i;
945 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
946 if (rgm->type != NULL_TREE
947 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
948 cmp_type, rgm->type,
949 OPTIMIZE_FOR_SPEED))
950 return false;
951 return true;
954 /* Calculate the maximum number of scalars per iteration for every
955 rgroup in LOOP_VINFO. */
957 static unsigned int
958 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
960 unsigned int res = 1;
961 unsigned int i;
962 rgroup_controls *rgm;
963 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
964 res = MAX (res, rgm->max_nscalars_per_iter);
965 return res;
968 /* Calculate the minimum precision necessary to represent:
970 MAX_NITERS * FACTOR
972 as an unsigned integer, where MAX_NITERS is the maximum number of
973 loop header iterations for the original scalar form of LOOP_VINFO. */
975 static unsigned
976 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
978 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
980 /* Get the maximum number of iterations that is representable
981 in the counter type. */
982 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
983 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
985 /* Get a more refined estimate for the number of iterations. */
986 widest_int max_back_edges;
987 if (max_loop_iterations (loop, &max_back_edges))
988 max_ni = wi::smin (max_ni, max_back_edges + 1);
990 /* Work out how many bits we need to represent the limit. */
991 return wi::min_precision (max_ni * factor, UNSIGNED);
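/* Conceptual sketch with 64-bit arithmetic (the real code uses widest_int):
   the minimum number of bits needed to represent LIMIT as an unsigned
   integer, i.e. what the wi::min_precision call above computes.  */

static unsigned
example_min_unsigned_precision (unsigned long long limit)
{
  unsigned prec = 0;
  while (limit)
    {
      prec++;
      limit >>= 1;
    }
  return prec;
}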
994 /* Each statement in LOOP_VINFO can be masked where necessary. Check
995 whether we can actually generate the masks required. Return true if so,
996 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
998 static bool
999 vect_verify_full_masking (loop_vec_info loop_vinfo)
1001 unsigned int min_ni_width;
1002 unsigned int max_nscalars_per_iter
1003 = vect_get_max_nscalars_per_iter (loop_vinfo);
1005 /* Use a normal loop if there are no statements that need masking.
1006 This only happens in rare degenerate cases: it means that the loop
1007 has no loads, no stores, and no live-out values. */
1008 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1009 return false;
1011 /* Work out how many bits we need to represent the limit. */
1012 min_ni_width
1013 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1015 /* Find a scalar mode for which WHILE_ULT is supported. */
1016 opt_scalar_int_mode cmp_mode_iter;
1017 tree cmp_type = NULL_TREE;
1018 tree iv_type = NULL_TREE;
1019 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1020 unsigned int iv_precision = UINT_MAX;
1022 if (iv_limit != -1)
1023 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1024 UNSIGNED);
1026 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1028 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1029 if (cmp_bits >= min_ni_width
1030 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1032 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1033 if (this_type
1034 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1036 /* Although we could stop as soon as we find a valid mode,
1037 there are at least two reasons why that's not always the
1038 best choice:
1040 - An IV that's Pmode or wider is more likely to be reusable
1041 in address calculations than an IV that's narrower than
1042 Pmode.
1044 - Doing the comparison in IV_PRECISION or wider allows
1045 a natural 0-based IV, whereas using a narrower comparison
1046 type requires mitigations against wrap-around.
1048 Conversely, if the IV limit is variable, doing the comparison
1049 in a wider type than the original type can introduce
1050 unnecessary extensions, so picking the widest valid mode
1051 is not always a good choice either.
1053 Here we prefer the first IV type that's Pmode or wider,
1054 and the first comparison type that's IV_PRECISION or wider.
1055 (The comparison type must be no wider than the IV type,
1056 to avoid extensions in the vector loop.)
1058 ??? We might want to try continuing beyond Pmode for ILP32
1059 targets if CMP_BITS < IV_PRECISION. */
1060 iv_type = this_type;
1061 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1062 cmp_type = this_type;
1063 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1064 break;
1069 if (!cmp_type)
1070 return false;
1072 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1073 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1074 return true;
1077 /* Check whether we can use vector accesses with length, based on a precision
1078 comparison. So far, to keep it simple, we only allow the case in which the
1079 precision of the target-supported length is larger than the precision
1080 required by the loop niters. */
1082 static bool
1083 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1085 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1086 return false;
1088 unsigned int max_nitems_per_iter = 1;
1089 unsigned int i;
1090 rgroup_controls *rgl;
1091 /* Find the maximum number of items per iteration for every rgroup. */
1092 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1094 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1095 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1098 /* Work out how many bits we need to represent the length limit. */
1099 unsigned int min_ni_prec
1100 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1102 /* Now use the maximum of the precisions below for one suitable IV type:
1103 - the IV's natural precision
1104 - the precision needed to hold: the maximum number of scalar
1105 iterations multiplied by the scale factor (min_ni_prec above)
1106 - the Pmode precision
1108 If min_ni_prec is less than the precision of the current niters,
1109 we prefer to still use the niters type. Prefer a Pmode or
1110 wider IV to avoid narrow conversions. */
1112 unsigned int ni_prec
1113 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1114 min_ni_prec = MAX (min_ni_prec, ni_prec);
1115 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1117 tree iv_type = NULL_TREE;
1118 opt_scalar_int_mode tmode_iter;
1119 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1121 scalar_mode tmode = tmode_iter.require ();
1122 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1124 /* ??? Do we really want to construct one IV whose precision exceeds
1125 BITS_PER_WORD? */
1126 if (tbits > BITS_PER_WORD)
1127 break;
1129 /* Find the first available standard integral type. */
1130 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1132 iv_type = build_nonstandard_integer_type (tbits, true);
1133 break;
1137 if (!iv_type)
1139 if (dump_enabled_p ())
1140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141 "can't vectorize with length-based partial vectors"
1142 " because there is no suitable iv type.\n");
1143 return false;
1146 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1147 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1149 return true;
1152 /* Calculate the cost of one scalar iteration of the loop. */
1153 static void
1154 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1156 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1157 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1158 int nbbs = loop->num_nodes, factor;
1159 int innerloop_iters, i;
1161 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1163 /* Gather costs for statements in the scalar loop. */
1165 /* FORNOW. */
1166 innerloop_iters = 1;
1167 if (loop->inner)
1168 innerloop_iters = 50; /* FIXME */
1170 for (i = 0; i < nbbs; i++)
1172 gimple_stmt_iterator si;
1173 basic_block bb = bbs[i];
1175 if (bb->loop_father == loop->inner)
1176 factor = innerloop_iters;
1177 else
1178 factor = 1;
1180 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1182 gimple *stmt = gsi_stmt (si);
1183 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1185 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1186 continue;
1188 /* Skip stmts that are not vectorized inside the loop. */
1189 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1190 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1191 && (!STMT_VINFO_LIVE_P (vstmt_info)
1192 || !VECTORIZABLE_CYCLE_DEF
1193 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1194 continue;
1196 vect_cost_for_stmt kind;
1197 if (STMT_VINFO_DATA_REF (stmt_info))
1199 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1200 kind = scalar_load;
1201 else
1202 kind = scalar_store;
1204 else if (vect_nop_conversion_p (stmt_info))
1205 continue;
1206 else
1207 kind = scalar_stmt;
1209 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1210 factor, kind, stmt_info, 0, vect_prologue);
1214 /* Now accumulate cost. */
1215 void *target_cost_data = init_cost (loop);
1216 stmt_info_for_cost *si;
1217 int j;
1218 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1219 j, si)
1220 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1221 si->kind, si->stmt_info, si->vectype,
1222 si->misalign, vect_body);
1223 unsigned dummy, body_cost = 0;
1224 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1225 destroy_cost_data (target_cost_data);
1226 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
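/* Conceptual sketch (illustrative names, not the target cost hooks): the
   single scalar iteration cost gathered above is a weighted sum in which
   each statement's cost is scaled by FACTOR, i.e. by the assumed trip
   count of the inner loop for statements inside it, and by 1 otherwise.  */

static unsigned
example_scalar_iteration_cost (const unsigned *stmt_cost,
			       const unsigned *factor, unsigned nstmts)
{
  unsigned cost = 0;
  for (unsigned i = 0; i < nstmts; i++)
    cost += factor[i] * stmt_cost[i];
  return cost;
}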
1230 /* Function vect_analyze_loop_form_1.
1232 Verify that certain CFG restrictions hold, including:
1233 - the loop has a pre-header
1234 - the loop has a single entry and exit
1235 - the loop exit condition is simple enough
1236 - the number of iterations can be analyzed, i.e., a countable loop. The
1237 niter could be analyzed under some assumptions. */
1239 opt_result
1240 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1241 tree *assumptions, tree *number_of_iterationsm1,
1242 tree *number_of_iterations, gcond **inner_loop_cond)
1244 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1246 /* Different restrictions apply when we are considering an inner-most loop,
1247 vs. an outer (nested) loop.
1248 (FORNOW. May want to relax some of these restrictions in the future). */
1250 if (!loop->inner)
1252 /* Inner-most loop. We currently require that the number of BBs is
1253 exactly 2 (the header and latch). Vectorizable inner-most loops
1254 look like this:
1256 (pre-header)
1258 header <--------+
1259 | | |
1260 | +--> latch --+
1262 (exit-bb) */
1264 if (loop->num_nodes != 2)
1265 return opt_result::failure_at (vect_location,
1266 "not vectorized:"
1267 " control flow in loop.\n");
1269 if (empty_block_p (loop->header))
1270 return opt_result::failure_at (vect_location,
1271 "not vectorized: empty loop.\n");
1273 else
1275 class loop *innerloop = loop->inner;
1276 edge entryedge;
1278 /* Nested loop. We currently require that the loop is doubly-nested,
1279 contains a single inner loop, and the number of BBs is exactly 5.
1280 Vectorizable outer-loops look like this:
1282 (pre-header)
1284 header <---+
1286 inner-loop |
1288 tail ------+
1290 (exit-bb)
1292 The inner-loop has the properties expected of inner-most loops
1293 as described above. */
1295 if ((loop->inner)->inner || (loop->inner)->next)
1296 return opt_result::failure_at (vect_location,
1297 "not vectorized:"
1298 " multiple nested loops.\n");
1300 if (loop->num_nodes != 5)
1301 return opt_result::failure_at (vect_location,
1302 "not vectorized:"
1303 " control flow in loop.\n");
1305 entryedge = loop_preheader_edge (innerloop);
1306 if (entryedge->src != loop->header
1307 || !single_exit (innerloop)
1308 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1309 return opt_result::failure_at (vect_location,
1310 "not vectorized:"
1311 " unsupported outerloop form.\n");
1313 /* Analyze the inner-loop. */
1314 tree inner_niterm1, inner_niter, inner_assumptions;
1315 opt_result res
1316 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1317 &inner_assumptions, &inner_niterm1,
1318 &inner_niter, NULL);
1319 if (!res)
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1323 "not vectorized: Bad inner loop.\n");
1324 return res;
1327 /* Don't support analyzing niter under assumptions for inner
1328 loop. */
1329 if (!integer_onep (inner_assumptions))
1330 return opt_result::failure_at (vect_location,
1331 "not vectorized: Bad inner loop.\n");
1333 if (!expr_invariant_in_loop_p (loop, inner_niter))
1334 return opt_result::failure_at (vect_location,
1335 "not vectorized: inner-loop count not"
1336 " invariant.\n");
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_NOTE, vect_location,
1340 "Considering outer-loop vectorization.\n");
1343 if (!single_exit (loop))
1344 return opt_result::failure_at (vect_location,
1345 "not vectorized: multiple exits.\n");
1346 if (EDGE_COUNT (loop->header->preds) != 2)
1347 return opt_result::failure_at (vect_location,
1348 "not vectorized:"
1349 " too many incoming edges.\n");
1351 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1352 that the loop is represented as a do-while (with a proper if-guard
1353 before the loop if needed), where the loop header contains all the
1354 executable statements, and the latch is empty. */
1355 if (!empty_block_p (loop->latch)
1356 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1357 return opt_result::failure_at (vect_location,
1358 "not vectorized: latch block not empty.\n");
1360 /* Make sure the exit is not abnormal. */
1361 edge e = single_exit (loop);
1362 if (e->flags & EDGE_ABNORMAL)
1363 return opt_result::failure_at (vect_location,
1364 "not vectorized:"
1365 " abnormal loop exit edge.\n");
1367 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1368 number_of_iterationsm1);
1369 if (!*loop_cond)
1370 return opt_result::failure_at
1371 (vect_location,
1372 "not vectorized: complicated exit condition.\n");
1374 if (integer_zerop (*assumptions)
1375 || !*number_of_iterations
1376 || chrec_contains_undetermined (*number_of_iterations))
1377 return opt_result::failure_at
1378 (*loop_cond,
1379 "not vectorized: number of iterations cannot be computed.\n");
1381 if (integer_zerop (*number_of_iterations))
1382 return opt_result::failure_at
1383 (*loop_cond,
1384 "not vectorized: number of iterations = 0.\n");
1386 return opt_result::success ();
1389 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1391 opt_loop_vec_info
1392 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1394 tree assumptions, number_of_iterations, number_of_iterationsm1;
1395 gcond *loop_cond, *inner_loop_cond = NULL;
1397 opt_result res
1398 = vect_analyze_loop_form_1 (loop, &loop_cond,
1399 &assumptions, &number_of_iterationsm1,
1400 &number_of_iterations, &inner_loop_cond);
1401 if (!res)
1402 return opt_loop_vec_info::propagate_failure (res);
1404 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1405 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1406 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1407 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1408 if (!integer_onep (assumptions))
1410 /* We consider vectorizing this loop by versioning it under
1411 some assumptions. In order to do this, we need to clear
1412 existing information computed by scev and niter analyzer. */
1413 scev_reset_htab ();
1414 free_numbers_of_iterations_estimates (loop);
1415 /* Also set flag for this loop so that following scev and niter
1416 analysis are done under the assumptions. */
1417 loop_constraint_set (loop, LOOP_C_FINITE);
1418 /* Also record the assumptions for versioning. */
1419 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1422 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1424 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_NOTE, vect_location,
1427 "Symbolic number of iterations is ");
1428 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1429 dump_printf (MSG_NOTE, "\n");
1433 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1434 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1435 if (inner_loop_cond)
1437 stmt_vec_info inner_loop_cond_info
1438 = loop_vinfo->lookup_stmt (inner_loop_cond);
1439 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1442 gcc_assert (!loop->aux);
1443 loop->aux = loop_vinfo;
1444 return opt_loop_vec_info::success (loop_vinfo);
1449 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1450 statements, update the vectorization factor. */
1452 static void
1453 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1455 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1456 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1457 int nbbs = loop->num_nodes;
1458 poly_uint64 vectorization_factor;
1459 int i;
1461 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1463 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1464 gcc_assert (known_ne (vectorization_factor, 0U));
1466 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1467 the vectorization factor of the loop is the unrolling factor required by
1468 the SLP instances. If that unrolling factor is 1, we say that we
1469 perform pure SLP on the loop - cross-iteration parallelism is not
1470 exploited. */
1471 bool only_slp_in_loop = true;
1472 for (i = 0; i < nbbs; i++)
1474 basic_block bb = bbs[i];
1475 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1476 gsi_next (&si))
1478 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1479 if (!stmt_info)
1480 continue;
1481 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1482 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1483 && !PURE_SLP_STMT (stmt_info))
1484 /* STMT needs both SLP and loop-based vectorization. */
1485 only_slp_in_loop = false;
1487 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1488 gsi_next (&si))
1490 if (is_gimple_debug (gsi_stmt (si)))
1491 continue;
1492 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1493 stmt_info = vect_stmt_to_vectorize (stmt_info);
1494 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1495 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1496 && !PURE_SLP_STMT (stmt_info))
1497 /* STMT needs both SLP and loop-based vectorization. */
1498 only_slp_in_loop = false;
1502 if (only_slp_in_loop)
1504 if (dump_enabled_p ())
1505 dump_printf_loc (MSG_NOTE, vect_location,
1506 "Loop contains only SLP stmts\n");
1507 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1509 else
1511 if (dump_enabled_p ())
1512 dump_printf_loc (MSG_NOTE, vect_location,
1513 "Loop contains SLP and non-SLP stmts\n");
1514 /* Both the vectorization factor and unroll factor have the form
1515 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1516 so they must have a common multiple. */
1517 vectorization_factor
1518 = force_common_multiple (vectorization_factor,
1519 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1522 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1523 if (dump_enabled_p ())
1525 dump_printf_loc (MSG_NOTE, vect_location,
1526 "Updating vectorization factor to ");
1527 dump_dec (MSG_NOTE, vectorization_factor);
1528 dump_printf (MSG_NOTE, ".\n");
1532 /* Return true if STMT_INFO describes a double reduction phi and if
1533 the other phi in the reduction is also relevant for vectorization.
1534 This rejects cases such as:
1536 outer1:
1537 x_1 = PHI <x_3(outer2), ...>;
1540 inner:
1541 x_2 = ...;
1544 outer2:
1545 x_3 = PHI <x_2(inner)>;
1547 if nothing in x_2 or elsewhere makes x_1 relevant. */
1549 static bool
1550 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1552 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1553 return false;
1555 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1558 /* Function vect_analyze_loop_operations.
1560 Scan the loop stmts and make sure they are all vectorizable. */
1562 static opt_result
1563 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1565 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1566 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1567 int nbbs = loop->num_nodes;
1568 int i;
1569 stmt_vec_info stmt_info;
1570 bool need_to_vectorize = false;
1571 bool ok;
1573 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1575 auto_vec<stmt_info_for_cost> cost_vec;
1577 for (i = 0; i < nbbs; i++)
1579 basic_block bb = bbs[i];
1581 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1582 gsi_next (&si))
1584 gphi *phi = si.phi ();
1585 ok = true;
1587 stmt_info = loop_vinfo->lookup_stmt (phi);
1588 if (dump_enabled_p ())
1589 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1590 if (virtual_operand_p (gimple_phi_result (phi)))
1591 continue;
1593 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1594 (i.e., a phi in the tail of the outer-loop). */
1595 if (! is_loop_header_bb_p (bb))
1597 /* FORNOW: we currently don't support the case that these phis
1598 are not used in the outerloop (unless it is double reduction,
1599 i.e., this phi is vect_reduction_def), because this case
1600 requires us to actually do something here. */
1601 if (STMT_VINFO_LIVE_P (stmt_info)
1602 && !vect_active_double_reduction_p (stmt_info))
1603 return opt_result::failure_at (phi,
1604 "Unsupported loop-closed phi"
1605 " in outer-loop.\n");
1607 /* If PHI is used in the outer loop, we check that its operand
1608 is defined in the inner loop. */
1609 if (STMT_VINFO_RELEVANT_P (stmt_info))
1611 tree phi_op;
1613 if (gimple_phi_num_args (phi) != 1)
1614 return opt_result::failure_at (phi, "unsupported phi");
1616 phi_op = PHI_ARG_DEF (phi, 0);
1617 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1618 if (!op_def_info)
1619 return opt_result::failure_at (phi, "unsupported phi\n");
1621 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1622 && (STMT_VINFO_RELEVANT (op_def_info)
1623 != vect_used_in_outer_by_reduction))
1624 return opt_result::failure_at (phi, "unsupported phi\n");
1626 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1627 || (STMT_VINFO_DEF_TYPE (stmt_info)
1628 == vect_double_reduction_def))
1629 && !vectorizable_lc_phi (loop_vinfo,
1630 stmt_info, NULL, NULL))
1631 return opt_result::failure_at (phi, "unsupported phi\n");
1634 continue;
1637 gcc_assert (stmt_info);
1639 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1640 || STMT_VINFO_LIVE_P (stmt_info))
1641 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1642 /* A scalar-dependence cycle that we don't support. */
1643 return opt_result::failure_at (phi,
1644 "not vectorized:"
1645 " scalar dependence cycle.\n");
1647 if (STMT_VINFO_RELEVANT_P (stmt_info))
1649 need_to_vectorize = true;
1650 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1651 && ! PURE_SLP_STMT (stmt_info))
1652 ok = vectorizable_induction (loop_vinfo,
1653 stmt_info, NULL, NULL,
1654 &cost_vec);
1655 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1656 || (STMT_VINFO_DEF_TYPE (stmt_info)
1657 == vect_double_reduction_def)
1658 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1659 && ! PURE_SLP_STMT (stmt_info))
1660 ok = vectorizable_reduction (loop_vinfo,
1661 stmt_info, NULL, NULL, &cost_vec);
1664 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1665 if (ok
1666 && STMT_VINFO_LIVE_P (stmt_info)
1667 && !PURE_SLP_STMT (stmt_info))
1668 ok = vectorizable_live_operation (loop_vinfo,
1669 stmt_info, NULL, NULL, NULL,
1670 -1, false, &cost_vec);
1672 if (!ok)
1673 return opt_result::failure_at (phi,
1674 "not vectorized: relevant phi not "
1675 "supported: %G",
1676 static_cast <gimple *> (phi));
1679 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1680 gsi_next (&si))
1682 gimple *stmt = gsi_stmt (si);
1683 if (!gimple_clobber_p (stmt)
1684 && !is_gimple_debug (stmt))
1686 opt_result res
1687 = vect_analyze_stmt (loop_vinfo,
1688 loop_vinfo->lookup_stmt (stmt),
1689 &need_to_vectorize,
1690 NULL, NULL, &cost_vec);
1691 if (!res)
1692 return res;
1695 } /* bbs */
1697 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1699 /* All operations in the loop are either irrelevant (they deal with loop
1700 control, or are dead), or are only used outside the loop and can be moved
1701 out of the loop (e.g. invariants, inductions). The loop can be
1702 optimized away by scalar optimizations. We're better off not
1703 touching this loop. */
1704 if (!need_to_vectorize)
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location,
1708 "All the computation can be taken out of the loop.\n");
1709 return opt_result::failure_at
1710 (vect_location,
1711 "not vectorized: redundant loop. no profit to vectorize.\n");
1714 return opt_result::success ();
1717 /* Return true if we know that the iteration count is smaller than the
1718 vectorization factor. Return false if it isn't, or if we can't be sure
1719 either way. */
1721 static bool
1722 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1724 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1726 HOST_WIDE_INT max_niter;
1727 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1728 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1729 else
1730 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1732 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1733 return true;
1735 return false;
1738 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1739 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1740 definitely no, or -1 if it's worth retrying. */
1742 static int
1743 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1745 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1746 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1748 /* Only loops that can handle partially-populated vectors can have iteration
1749 counts less than the vectorization factor. */
1750 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1752 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1756 "not vectorized: iteration count smaller than "
1757 "vectorization factor.\n");
1758 return 0;
1762 int min_profitable_iters, min_profitable_estimate;
1763 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1764 &min_profitable_estimate);
1766 if (min_profitable_iters < 0)
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1770 "not vectorized: vectorization not profitable.\n");
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773 "not vectorized: vector version will never be "
1774 "profitable.\n");
1775 return -1;
1778 int min_scalar_loop_bound = (param_min_vect_loop_bound
1779 * assumed_vf);
1781 /* Use the cost model only if it is more conservative than the user-specified
1782 threshold. */
1783 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1784 min_profitable_iters);
1786 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1788 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1789 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1791 if (dump_enabled_p ())
1792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1793 "not vectorized: vectorization not profitable.\n");
1794 if (dump_enabled_p ())
1795 dump_printf_loc (MSG_NOTE, vect_location,
1796 "not vectorized: iteration count smaller than user "
1797 "specified loop bound parameter or minimum profitable "
1798 "iterations (whichever is more conservative).\n");
1799 return 0;
1802 /* The static profitability threshold min_profitable_estimate includes
1803 the cost of having to check at runtime whether the scalar loop
1804 should be used instead. If it turns out that we don't need or want
1805 such a check, the threshold we should use for the static estimate
1806 is simply the point at which the vector loop becomes more profitable
1807 than the scalar loop. */
1808 if (min_profitable_estimate > min_profitable_iters
1809 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1810 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1811 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1812 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1814 if (dump_enabled_p ())
1815 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1816 " choice between the scalar and vector loops\n");
1817 min_profitable_estimate = min_profitable_iters;
1820 HOST_WIDE_INT estimated_niter;
1822 /* If we are vectorizing an epilogue then we know the maximum number of
1823 scalar iterations it will cover is at least one lower than the
1824 vectorization factor of the main loop. */
1825 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1826 estimated_niter
1827 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1828 else
1830 estimated_niter = estimated_stmt_executions_int (loop);
1831 if (estimated_niter == -1)
1832 estimated_niter = likely_max_stmt_executions_int (loop);
1834 if (estimated_niter != -1
1835 && ((unsigned HOST_WIDE_INT) estimated_niter
1836 < MAX (th, (unsigned) min_profitable_estimate)))
1838 if (dump_enabled_p ())
1839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1840 "not vectorized: estimated iteration count too "
1841 "small.\n");
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_NOTE, vect_location,
1844 "not vectorized: estimated iteration count smaller "
1845 "than specified loop bound parameter or minimum "
1846 "profitable iterations (whichever is more "
1847 "conservative).\n");
1848 return -1;
1851 return 1;
1854 static opt_result
1855 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1856 vec<data_reference_p> *datarefs,
1857 unsigned int *n_stmts)
1859 *n_stmts = 0;
1860 for (unsigned i = 0; i < loop->num_nodes; i++)
1861 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1862 !gsi_end_p (gsi); gsi_next (&gsi))
1864 gimple *stmt = gsi_stmt (gsi);
1865 if (is_gimple_debug (stmt))
1866 continue;
1867 ++(*n_stmts);
1868 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1869 NULL, 0);
1870 if (!res)
1872 if (is_gimple_call (stmt) && loop->safelen)
1874 tree fndecl = gimple_call_fndecl (stmt), op;
1875 if (fndecl != NULL_TREE)
1877 cgraph_node *node = cgraph_node::get (fndecl);
1878 if (node != NULL && node->simd_clones != NULL)
1880 unsigned int j, n = gimple_call_num_args (stmt);
1881 for (j = 0; j < n; j++)
1883 op = gimple_call_arg (stmt, j);
1884 if (DECL_P (op)
1885 || (REFERENCE_CLASS_P (op)
1886 && get_base_address (op)))
1887 break;
1889 op = gimple_call_lhs (stmt);
1890 /* Ignore #pragma omp declare simd functions
1891 if they don't have data references in the
1892 call stmt itself. */
1893 if (j == n
1894 && !(op
1895 && (DECL_P (op)
1896 || (REFERENCE_CLASS_P (op)
1897 && get_base_address (op)))))
1898 continue;
1902 return res;
1904 /* If dependence analysis will give up due to the limit on the
1905 number of datarefs stop here and fail fatally. */
1906 if (datarefs->length ()
1907 > (unsigned)param_loop_max_datarefs_for_datadeps)
1908 return opt_result::failure_at (stmt, "exceeded param "
1909 "loop-max-datarefs-for-datadeps\n");
1911 return opt_result::success ();
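/* The simd-clone special case above can be illustrated as follows
   (sketch; foo and the arrays are hypothetical):

     #pragma omp declare simd
     int foo (int x);

     #pragma omp simd
     for (int i = 0; i < n; i++)
       a[i] = foo (b[i]);

   In GIMPLE the call is fed by and feeds plain SSA names
   (_1 = b[i]; _2 = foo (_1); a[i] = _2;), so the call statement itself
   contains no data reference and is skipped when gathering datarefs,
   relying on loop->safelen set by the simd pragma.  */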
1914 /* Look for SLP-only access groups and turn each individual access into its own
1915 group. */
1916 static void
1917 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1919 unsigned int i;
1920 struct data_reference *dr;
1922 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1924 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1925 FOR_EACH_VEC_ELT (datarefs, i, dr)
1927 gcc_assert (DR_REF (dr));
1928 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1930 /* Check if the access is part of an interleaving chain. */
1931 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1933 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1934 unsigned int group_size = DR_GROUP_SIZE (first_element);
1936 /* Check whether this is an SLP-only group. */
1937 if (!STMT_SLP_TYPE (stmt_info)
1938 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1940 /* Dissolve the group. */
1941 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1943 stmt_vec_info vinfo = first_element;
1944 while (vinfo)
1946 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1947 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1948 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1949 DR_GROUP_SIZE (vinfo) = 1;
1950 if (STMT_VINFO_STRIDED_P (first_element))
1951 DR_GROUP_GAP (vinfo) = 0;
1952 else
1953 DR_GROUP_GAP (vinfo) = group_size - 1;
1954 vinfo = next;
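/* For example (hypothetical access pattern), a two-element load group
   over a[2*i] and a[2*i+1] that was created only for SLP is dissolved
   here when its statements end up not being SLP-vectorized: each access
   becomes its own group with DR_GROUP_SIZE 1 and, in the non-strided
   case, DR_GROUP_GAP 1, so the remaining element of the original group
   is stepped over.  */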
1962 /* Decides whether we need to create an epilogue loop to handle
1963 remaining scalar iterations and sets PEELING_FOR_NITER accordingly. */
1965 void
1966 determine_peel_for_niter (loop_vec_info loop_vinfo)
1968 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1970 unsigned HOST_WIDE_INT const_vf;
1971 HOST_WIDE_INT max_niter
1972 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1974 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1975 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1976 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1977 (loop_vinfo));
1979 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1980 /* The main loop handles all iterations. */
1981 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1982 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1983 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1985 /* Work out the (constant) number of iterations that need to be
1986 peeled for reasons other than niters. */
1987 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1988 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1989 peel_niter += 1;
1990 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1991 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1992 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1994 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1995 /* ??? When peeling for gaps but not alignment, we could
1996 try to check whether the (variable) niters is known to be
1997 VF * N + 1. That's something of a niche case though. */
1998 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1999 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2000 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2001 < (unsigned) exact_log2 (const_vf))
2002 /* In case of versioning, check if the maximum number of
2003 iterations is greater than th. If they are identical,
2004 the epilogue is unnecessary. */
2005 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2006 || ((unsigned HOST_WIDE_INT) max_niter
2007 > (th / const_vf) * const_vf))))
2008 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
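/* A worked example for the constant-niters case above (the numbers are
   hypothetical): with LOOP_VINFO_INT_NITERS 100, a vectorization factor
   of 8, 3 iterations peeled for alignment and no peeling for gaps,
   100 - 3 = 97 is not a multiple of 8, so PEELING_FOR_NITER is set and
   an epilogue loop handles the remaining 97 % 8 = 1 iteration.  */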
2012 /* Function vect_analyze_loop_2.
2014 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2015 for it. The different analyses will record information in the
2016 loop_vec_info struct. */
2017 static opt_result
2018 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2020 opt_result ok = opt_result::success ();
2021 int res;
2022 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2023 poly_uint64 min_vf = 2;
2024 loop_vec_info orig_loop_vinfo = NULL;
2026 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2027 loop_vec_info of the first vectorized loop. */
2028 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2029 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2030 else
2031 orig_loop_vinfo = loop_vinfo;
2032 gcc_assert (orig_loop_vinfo);
2034 /* The first group of checks is independent of the vector size. */
2035 fatal = true;
2037 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2038 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2039 return opt_result::failure_at (vect_location,
2040 "not vectorized: simd if(0)\n");
2042 /* Find all data references in the loop (which correspond to vdefs/vuses)
2043 and analyze their evolution in the loop. */
2045 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2047 /* Gather the data references and count stmts in the loop. */
2048 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2050 opt_result res
2051 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2052 &LOOP_VINFO_DATAREFS (loop_vinfo),
2053 n_stmts);
2054 if (!res)
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2058 "not vectorized: loop contains function "
2059 "calls or data references that cannot "
2060 "be analyzed\n");
2061 return res;
2063 loop_vinfo->shared->save_datarefs ();
2065 else
2066 loop_vinfo->shared->check_datarefs ();
2068 /* Analyze the data references and also adjust the minimal
2069 vectorization factor according to the loads and stores. */
2071 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2072 if (!ok)
2074 if (dump_enabled_p ())
2075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2076 "bad data references.\n");
2077 return ok;
2080 /* Classify all cross-iteration scalar data-flow cycles.
2081 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2082 vect_analyze_scalar_cycles (loop_vinfo);
2084 vect_pattern_recog (loop_vinfo);
2086 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2088 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2089 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2091 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2092 if (!ok)
2094 if (dump_enabled_p ())
2095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2096 "bad data access.\n");
2097 return ok;
2100 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2102 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2103 if (!ok)
2105 if (dump_enabled_p ())
2106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2107 "unexpected pattern.\n");
2108 return ok;
2111 /* The rest of the analysis below depends on the vector size in some way. */
2112 fatal = false;
2114 /* Analyze data dependences between the data-refs in the loop
2115 and adjust the maximum vectorization factor according to
2116 the dependences.
2117 FORNOW: fail at the first data dependence that we encounter. */
2119 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2120 if (!ok)
2122 if (dump_enabled_p ())
2123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2124 "bad data dependence.\n");
2125 return ok;
2127 if (max_vf != MAX_VECTORIZATION_FACTOR
2128 && maybe_lt (max_vf, min_vf))
2129 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2130 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2132 ok = vect_determine_vectorization_factor (loop_vinfo);
2133 if (!ok)
2135 if (dump_enabled_p ())
2136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2137 "can't determine vectorization factor.\n");
2138 return ok;
2140 if (max_vf != MAX_VECTORIZATION_FACTOR
2141 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2142 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2144 /* Compute the scalar iteration cost. */
2145 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2147 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2149 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2150 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2151 if (!ok)
2152 return ok;
2154 /* If there are any SLP instances mark them as pure_slp. */
2155 bool slp = vect_make_slp_decision (loop_vinfo);
2156 if (slp)
2158 /* Find stmts that need to be both vectorized and SLPed. */
2159 vect_detect_hybrid_slp (loop_vinfo);
2161 /* Update the vectorization factor based on the SLP decision. */
2162 vect_update_vf_for_slp (loop_vinfo);
2164 /* Optimize the SLP graph with the vectorization factor fixed. */
2165 vect_optimize_slp (loop_vinfo);
2168 bool saved_can_use_partial_vectors_p
2169 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2171 /* We don't expect to have to roll back to anything other than an empty
2172 set of rgroups. */
2173 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2175 /* This is the point where we can re-start analysis with SLP forced off. */
2176 start_over:
2178 /* Now the vectorization factor is final. */
2179 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2180 gcc_assert (known_ne (vectorization_factor, 0U));
2182 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2184 dump_printf_loc (MSG_NOTE, vect_location,
2185 "vectorization_factor = ");
2186 dump_dec (MSG_NOTE, vectorization_factor);
2187 dump_printf (MSG_NOTE, ", niters = %wd\n",
2188 LOOP_VINFO_INT_NITERS (loop_vinfo));
2191 /* Analyze the alignment of the data-refs in the loop.
2192 Fail if a data reference is found that cannot be vectorized. */
2194 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2195 if (!ok)
2197 if (dump_enabled_p ())
2198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2199 "bad data alignment.\n");
2200 return ok;
2203 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2204 It is important to call pruning after vect_analyze_data_ref_accesses,
2205 since we use grouping information gathered by interleaving analysis. */
2206 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2207 if (!ok)
2208 return ok;
2210 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2211 vectorization, since we do not want to add extra peeling or
2212 add versioning for alignment. */
2213 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2214 /* This pass will decide on using loop versioning and/or loop peeling in
2215 order to enhance the alignment of data references in the loop. */
2216 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2217 if (!ok)
2218 return ok;
2220 if (slp)
2222 /* Analyze operations in the SLP instances. Note this may
2223 remove unsupported SLP instances which makes the above
2224 SLP kind detection invalid. */
2225 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2226 vect_slp_analyze_operations (loop_vinfo);
2227 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2229 ok = opt_result::failure_at (vect_location,
2230 "unsupported SLP instances\n");
2231 goto again;
2235 /* Dissolve SLP-only groups. */
2236 vect_dissolve_slp_only_groups (loop_vinfo);
2238 /* Scan all the remaining operations in the loop that are not subject
2239 to SLP and make sure they are vectorizable. */
2240 ok = vect_analyze_loop_operations (loop_vinfo);
2241 if (!ok)
2243 if (dump_enabled_p ())
2244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2245 "bad operation or unsupported loop bound.\n");
2246 return ok;
2249 /* For now we don't expect to mix both the masking and the length approach
2250 for one loop; disable partial vectors if both are recorded. */
2251 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2252 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2253 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2255 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2257 "can't vectorize a loop with partial vectors"
2258 " because we don't expect to mix different"
2259 " approaches with partial vectors for the"
2260 " same loop.\n");
2261 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2264 /* Decide whether to vectorize a loop with partial vectors for
2265 this vectorization factor. */
2266 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2268 if (param_vect_partial_vector_usage == 0)
2269 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2270 else if (vect_verify_full_masking (loop_vinfo)
2271 || vect_verify_loop_lens (loop_vinfo))
2273 /* The epilogue, and other cases where the iteration count is known to
2274 be less than VF, can still fully use vector accesses with length. */
2275 if (param_vect_partial_vector_usage == 1
2276 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2277 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2279 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2280 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2282 else
2283 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2285 else
2286 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2288 else
2289 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2291 if (dump_enabled_p ())
2293 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2294 dump_printf_loc (MSG_NOTE, vect_location,
2295 "operating on partial vectors.\n");
2296 else
2297 dump_printf_loc (MSG_NOTE, vect_location,
2298 "operating only on full vectors.\n");
2301 /* If an epilogue loop is required because of data accesses with gaps,
2302 one additional iteration needs to be peeled. Check if there are
2303 enough iterations for vectorization. */
2304 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2305 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2306 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2308 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2309 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2311 if (known_lt (wi::to_widest (scalar_niters), vf))
2312 return opt_result::failure_at (vect_location,
2313 "loop has no enough iterations to"
2314 " support peeling for gaps.\n");
2317 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2318 to be able to handle fewer than VF scalars, or needs to have a lower VF
2319 than the main loop. */
2320 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2321 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2322 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2323 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2324 return opt_result::failure_at (vect_location,
2325 "Vectorization factor too high for"
2326 " epilogue loop.\n");
2328 /* Check the costings of the loop make vectorizing worthwhile. */
2329 res = vect_analyze_loop_costing (loop_vinfo);
2330 if (res < 0)
2332 ok = opt_result::failure_at (vect_location,
2333 "Loop costings may not be worthwhile.\n");
2334 goto again;
2336 if (!res)
2337 return opt_result::failure_at (vect_location,
2338 "Loop costings not worthwhile.\n");
2340 determine_peel_for_niter (loop_vinfo);
2341 /* If an epilogue loop is required make sure we can create one. */
2342 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2343 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2345 if (dump_enabled_p ())
2346 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2347 if (!vect_can_advance_ivs_p (loop_vinfo)
2348 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2349 single_exit (LOOP_VINFO_LOOP
2350 (loop_vinfo))))
2352 ok = opt_result::failure_at (vect_location,
2353 "not vectorized: can't create required "
2354 "epilog loop\n");
2355 goto again;
2359 /* During peeling, we need to check if number of loop iterations is
2360 enough for both peeled prolog loop and vector loop. This check
2361 can be merged along with threshold check of loop versioning, so
2362 increase threshold for this case if necessary.
2364 If we are analyzing an epilogue we still want to check what its
2365 versioning threshold would be. If we decide to vectorize the epilogues we
2366 will want to use the lowest versioning threshold of all epilogues and main
2367 loop. This will enable us to enter a vectorized epilogue even when
2368 versioning the loop. We can't simply check whether the epilogue requires
2369 versioning though since we may have skipped some versioning checks when
2370 analyzing the epilogue. For instance, checks for alias versioning will be
2371 skipped when dealing with epilogues as we assume we already checked them
2372 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2373 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2375 poly_uint64 niters_th = 0;
2376 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2378 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2380 /* Niters for peeled prolog loop. */
2381 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2383 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2384 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2385 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2387 else
2388 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2391 /* Niters for at least one iteration of vectorized loop. */
2392 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2393 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2394 /* One additional iteration because of peeling for gap. */
2395 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2396 niters_th += 1;
2398 /* Use the same condition as vect_transform_loop to decide when to use
2399 the cost to determine a versioning threshold. */
2400 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2401 && ordered_p (th, niters_th))
2402 niters_th = ordered_max (poly_uint64 (th), niters_th);
2404 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2407 gcc_assert (known_eq (vectorization_factor,
2408 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2410 /* Ok to vectorize! */
2411 return opt_result::success ();
2413 again:
2414 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2415 gcc_assert (!ok);
2417 /* Try again with SLP forced off but if we didn't do any SLP there is
2418 no point in re-trying. */
2419 if (!slp)
2420 return ok;
2422 /* If there are reduction chains re-trying will fail anyway. */
2423 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2424 return ok;
2426 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2427 via interleaving or lane instructions. */
2428 slp_instance instance;
2429 slp_tree node;
2430 unsigned i, j;
2431 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2433 stmt_vec_info vinfo;
2434 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2435 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2436 continue;
2437 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2438 unsigned int size = DR_GROUP_SIZE (vinfo);
2439 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2440 if (! vect_store_lanes_supported (vectype, size, false)
2441 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2442 && ! vect_grouped_store_supported (vectype, size))
2443 return opt_result::failure_at (vinfo->stmt,
2444 "unsupported grouped store\n");
2445 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2447 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2448 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2449 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2450 size = DR_GROUP_SIZE (vinfo);
2451 vectype = STMT_VINFO_VECTYPE (vinfo);
2452 if (! vect_load_lanes_supported (vectype, size, false)
2453 && ! vect_grouped_load_supported (vectype, single_element_p,
2454 size))
2455 return opt_result::failure_at (vinfo->stmt,
2456 "unsupported grouped load\n");
2460 if (dump_enabled_p ())
2461 dump_printf_loc (MSG_NOTE, vect_location,
2462 "re-trying with SLP disabled\n");
2464 /* Roll back state appropriately. No SLP this time. */
2465 slp = false;
2466 /* Restore vectorization factor as it were without SLP. */
2467 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2468 /* Free the SLP instances. */
2469 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2470 vect_free_slp_instance (instance, false);
2471 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2472 /* Reset SLP type to loop_vect on all stmts. */
2473 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2475 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2476 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2477 !gsi_end_p (si); gsi_next (&si))
2479 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2480 STMT_SLP_TYPE (stmt_info) = loop_vect;
2481 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2482 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2484 /* vectorizable_reduction adjusts reduction stmt def-types,
2485 restore them to that of the PHI. */
2486 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2487 = STMT_VINFO_DEF_TYPE (stmt_info);
2488 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2489 (STMT_VINFO_REDUC_DEF (stmt_info)))
2490 = STMT_VINFO_DEF_TYPE (stmt_info);
2493 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2494 !gsi_end_p (si); gsi_next (&si))
2496 if (is_gimple_debug (gsi_stmt (si)))
2497 continue;
2498 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2499 STMT_SLP_TYPE (stmt_info) = loop_vect;
2500 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2502 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2503 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2504 STMT_SLP_TYPE (stmt_info) = loop_vect;
2505 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2506 !gsi_end_p (pi); gsi_next (&pi))
2507 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2508 = loop_vect;
2512 /* Free optimized alias test DDRS. */
2513 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2514 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2515 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2516 /* Reset target cost data. */
2517 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2518 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2519 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2520 /* Reset accumulated rgroup information. */
2521 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2522 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2523 /* Reset assorted flags. */
2524 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2525 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2526 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2527 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2528 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2529 = saved_can_use_partial_vectors_p;
2531 goto start_over;
2534 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2535 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2536 OLD_LOOP_VINFO is better unless something specifically indicates
2537 otherwise.
2539 Note that this deliberately isn't a partial order. */
2541 static bool
2542 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2543 loop_vec_info old_loop_vinfo)
2545 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2546 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2548 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2549 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2551 /* Always prefer a VF of loop->simdlen over any other VF. */
2552 if (loop->simdlen)
2554 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2555 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2556 if (new_simdlen_p != old_simdlen_p)
2557 return new_simdlen_p;
2560 /* Limit the VFs to what is likely to be the maximum number of iterations,
2561 to handle cases in which at least one loop_vinfo is fully-masked. */
2562 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2563 if (estimated_max_niter != -1)
2565 if (known_le (estimated_max_niter, new_vf))
2566 new_vf = estimated_max_niter;
2567 if (known_le (estimated_max_niter, old_vf))
2568 old_vf = estimated_max_niter;
2571 /* Check whether the (fractional) cost per scalar iteration is lower
2572 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2573 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2574 * poly_widest_int (old_vf));
2575 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2576 * poly_widest_int (new_vf));
2577 if (maybe_lt (rel_old, rel_new))
2579 /* When old_loop_vinfo uses a variable vectorization factor,
2580 we know that it has a lower cost for at least one runtime VF.
2581 However, we don't know how likely that VF is.
2583 One option would be to compare the costs for the estimated VFs.
2584 The problem is that that can put too much pressure on the cost
2585 model. E.g. if the estimated VF is also the lowest possible VF,
2586 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2587 for the estimated VF, we'd then choose new_loop_vinfo even
2588 though (a) new_loop_vinfo might not actually be better than
2589 old_loop_vinfo for that VF and (b) it would be significantly
2590 worse at larger VFs.
2592 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2593 no more expensive than old_loop_vinfo even after doubling the
2594 estimated old_loop_vinfo VF. For all but trivial loops, this
2595 ensures that we only pick new_loop_vinfo if it is significantly
2596 better than old_loop_vinfo at the estimated VF. */
2597 if (rel_new.is_constant ())
2598 return false;
2600 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2601 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2602 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2603 * widest_int (old_estimated_vf));
2604 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2605 * widest_int (new_estimated_vf));
2606 return estimated_rel_new * 2 <= estimated_rel_old;
2608 if (known_lt (rel_new, rel_old))
2609 return true;
2611 /* If there's nothing to choose between the loop bodies, see whether
2612 there's a difference in the prologue and epilogue costs. */
2613 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2614 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2616 return false;
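/* A numeric sketch of the cross-multiplied cost comparison above
   (hypothetical costs): with new_inside_cost 20 at new_vf 4 and
   old_inside_cost 36 at old_vf 8, rel_new = 20 * 8 = 160 and
   rel_old = 36 * 4 = 144, so old_loop_vinfo has the lower cost per
   scalar iteration (36/8 = 4.5 vs. 20/4 = 5) and is kept.  */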
2619 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2620 true if we should. */
2622 static bool
2623 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2624 loop_vec_info old_loop_vinfo)
2626 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2627 return false;
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_NOTE, vect_location,
2631 "***** Preferring vector mode %s to vector mode %s\n",
2632 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2633 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2634 return true;
2637 /* Function vect_analyze_loop.
2639 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2640 for it. The different analyses will record information in the
2641 loop_vec_info struct. */
2642 opt_loop_vec_info
2643 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2645 auto_vector_modes vector_modes;
2647 /* Autodetect first vector size we try. */
2648 unsigned int autovec_flags
2649 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2650 loop->simdlen != 0);
2651 unsigned int mode_i = 0;
2653 DUMP_VECT_SCOPE ("analyze_loop_nest");
2655 if (loop_outer (loop)
2656 && loop_vec_info_for_loop (loop_outer (loop))
2657 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2658 return opt_loop_vec_info::failure_at (vect_location,
2659 "outer-loop already vectorized.\n");
2661 if (!find_loop_nest (loop, &shared->loop_nest))
2662 return opt_loop_vec_info::failure_at
2663 (vect_location,
2664 "not vectorized: loop nest containing two or more consecutive inner"
2665 " loops cannot be vectorized\n");
2667 unsigned n_stmts = 0;
2668 machine_mode autodetected_vector_mode = VOIDmode;
2669 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2670 machine_mode next_vector_mode = VOIDmode;
2671 poly_uint64 lowest_th = 0;
2672 unsigned vectorized_loops = 0;
2673 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2674 && !unlimited_cost_model (loop));
2676 bool vect_epilogues = false;
2677 opt_result res = opt_result::success ();
2678 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2679 while (1)
2681 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2682 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2683 if (!loop_vinfo)
2685 if (dump_enabled_p ())
2686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2687 "bad loop form.\n");
2688 gcc_checking_assert (first_loop_vinfo == NULL);
2689 return loop_vinfo;
2691 loop_vinfo->vector_mode = next_vector_mode;
2693 bool fatal = false;
2695 /* When pick_lowest_cost_p is true, we should in principle iterate
2696 over all the loop_vec_infos that LOOP_VINFO could replace and
2697 try to vectorize LOOP_VINFO under the same conditions.
2698 E.g. when trying to replace an epilogue loop, we should vectorize
2699 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2700 to replace the main loop, we should vectorize LOOP_VINFO as a main
2701 loop too.
2703 However, autovectorize_vector_modes is usually sorted as follows:
2705 - Modes that naturally produce lower VFs usually follow modes that
2706 naturally produce higher VFs.
2708 - When modes naturally produce the same VF, maskable modes
2709 usually follow unmaskable ones, so that the maskable mode
2710 can be used to vectorize the epilogue of the unmaskable mode.
2712 This order is preferred because it leads to the maximum
2713 epilogue vectorization opportunities. Targets should only use
2714 a different order if they want to make wide modes available while
2715 disparaging them relative to earlier, smaller modes. The assumption
2716 in that case is that the wider modes are more expensive in some
2717 way that isn't reflected directly in the costs.
2719 There should therefore be few interesting cases in which
2720 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2721 treated as a standalone loop, and ends up being genuinely cheaper
2722 than FIRST_LOOP_VINFO. */
2723 if (vect_epilogues)
2724 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2726 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2727 if (mode_i == 0)
2728 autodetected_vector_mode = loop_vinfo->vector_mode;
2729 if (dump_enabled_p ())
2731 if (res)
2732 dump_printf_loc (MSG_NOTE, vect_location,
2733 "***** Analysis succeeded with vector mode %s\n",
2734 GET_MODE_NAME (loop_vinfo->vector_mode));
2735 else
2736 dump_printf_loc (MSG_NOTE, vect_location,
2737 "***** Analysis failed with vector mode %s\n",
2738 GET_MODE_NAME (loop_vinfo->vector_mode));
2741 loop->aux = NULL;
2743 if (!fatal)
2744 while (mode_i < vector_modes.length ()
2745 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2747 if (dump_enabled_p ())
2748 dump_printf_loc (MSG_NOTE, vect_location,
2749 "***** The result for vector mode %s would"
2750 " be the same\n",
2751 GET_MODE_NAME (vector_modes[mode_i]));
2752 mode_i += 1;
2755 if (res)
2757 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2758 vectorized_loops++;
2760 /* Once we hit the desired simdlen for the first time,
2761 discard any previous attempts. */
2762 if (simdlen
2763 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2765 delete first_loop_vinfo;
2766 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2767 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2768 simdlen = 0;
2770 else if (pick_lowest_cost_p && first_loop_vinfo)
2772 /* Keep trying to roll back vectorization attempts while the
2773 loop_vec_infos they produced were worse than this one. */
2774 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2775 while (!vinfos.is_empty ()
2776 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2778 gcc_assert (vect_epilogues);
2779 delete vinfos.pop ();
2781 if (vinfos.is_empty ()
2782 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2784 delete first_loop_vinfo;
2785 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2786 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2790 if (first_loop_vinfo == NULL)
2792 first_loop_vinfo = loop_vinfo;
2793 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2795 else if (vect_epilogues
2796 /* For now only allow one epilogue loop. */
2797 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2799 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2800 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2801 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2802 || maybe_ne (lowest_th, 0U));
2803 /* Keep track of the known smallest versioning
2804 threshold. */
2805 if (ordered_p (lowest_th, th))
2806 lowest_th = ordered_min (lowest_th, th);
2808 else
2810 delete loop_vinfo;
2811 loop_vinfo = opt_loop_vec_info::success (NULL);
2814 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2815 enabled, SIMDUID is not set, it is the innermost loop and we have
2816 either already found the loop's SIMDLEN or there was no SIMDLEN to
2817 begin with.
2818 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2819 vect_epilogues = (!simdlen
2820 && loop->inner == NULL
2821 && param_vect_epilogues_nomask
2822 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2823 && !loop->simduid
2824 /* For now only allow one epilogue loop, but allow
2825 pick_lowest_cost_p to replace it. */
2826 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2827 || pick_lowest_cost_p));
2829 /* Commit to first_loop_vinfo if we have no reason to try
2830 alternatives. */
2831 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2832 break;
2834 else
2836 delete loop_vinfo;
2837 loop_vinfo = opt_loop_vec_info::success (NULL);
2838 if (fatal)
2840 gcc_checking_assert (first_loop_vinfo == NULL);
2841 break;
2845 /* Handle the case where the original loop can use partial
2846 vectorization, but we only want to adopt it for the epilogue.
2847 The retry should be in the same mode as the original. */
2848 if (vect_epilogues
2849 && loop_vinfo
2850 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
2852 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2853 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
2854 if (dump_enabled_p ())
2855 dump_printf_loc (MSG_NOTE, vect_location,
2856 "***** Re-trying analysis with same vector mode"
2857 " %s for epilogue with partial vectors.\n",
2858 GET_MODE_NAME (loop_vinfo->vector_mode));
2859 continue;
2862 if (mode_i < vector_modes.length ()
2863 && VECTOR_MODE_P (autodetected_vector_mode)
2864 && (related_vector_mode (vector_modes[mode_i],
2865 GET_MODE_INNER (autodetected_vector_mode))
2866 == autodetected_vector_mode)
2867 && (related_vector_mode (autodetected_vector_mode,
2868 GET_MODE_INNER (vector_modes[mode_i]))
2869 == vector_modes[mode_i]))
2871 if (dump_enabled_p ())
2872 dump_printf_loc (MSG_NOTE, vect_location,
2873 "***** Skipping vector mode %s, which would"
2874 " repeat the analysis for %s\n",
2875 GET_MODE_NAME (vector_modes[mode_i]),
2876 GET_MODE_NAME (autodetected_vector_mode));
2877 mode_i += 1;
2880 if (mode_i == vector_modes.length ()
2881 || autodetected_vector_mode == VOIDmode)
2882 break;
2884 /* Try the next biggest vector size. */
2885 next_vector_mode = vector_modes[mode_i++];
2886 if (dump_enabled_p ())
2887 dump_printf_loc (MSG_NOTE, vect_location,
2888 "***** Re-trying analysis with vector mode %s\n",
2889 GET_MODE_NAME (next_vector_mode));
2892 if (first_loop_vinfo)
2894 loop->aux = (loop_vec_info) first_loop_vinfo;
2895 if (dump_enabled_p ())
2896 dump_printf_loc (MSG_NOTE, vect_location,
2897 "***** Choosing vector mode %s\n",
2898 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2899 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2900 return first_loop_vinfo;
2903 return opt_loop_vec_info::propagate_failure (res);
2906 /* Return true if there is an in-order reduction function for CODE, storing
2907 it in *REDUC_FN if so. */
2909 static bool
2910 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2912 switch (code)
2914 case PLUS_EXPR:
2915 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2916 return true;
2918 default:
2919 return false;
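/* For example (sketch; s and a are hypothetical), a floating-point
   accumulation

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += a[i];

   compiled without -fassociative-math must preserve the original
   summation order, so it can only be vectorized as an in-order
   (fold-left) reduction, using IFN_FOLD_LEFT_PLUS where the target
   provides it.  */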
2923 /* Function reduction_fn_for_scalar_code
2925 Input:
2926 CODE - tree_code of a reduction operation.
2928 Output:
2929 REDUC_FN - the corresponding internal function to be used to reduce the
2930 vector of partial results into a single scalar result, or IFN_LAST
2931 if the operation is a supported reduction operation, but does not have
2932 such an internal function.
2934 Return FALSE if CODE currently cannot be vectorized as reduction. */
2936 static bool
2937 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2939 switch (code)
2941 case MAX_EXPR:
2942 *reduc_fn = IFN_REDUC_MAX;
2943 return true;
2945 case MIN_EXPR:
2946 *reduc_fn = IFN_REDUC_MIN;
2947 return true;
2949 case PLUS_EXPR:
2950 *reduc_fn = IFN_REDUC_PLUS;
2951 return true;
2953 case BIT_AND_EXPR:
2954 *reduc_fn = IFN_REDUC_AND;
2955 return true;
2957 case BIT_IOR_EXPR:
2958 *reduc_fn = IFN_REDUC_IOR;
2959 return true;
2961 case BIT_XOR_EXPR:
2962 *reduc_fn = IFN_REDUC_XOR;
2963 return true;
2965 case MULT_EXPR:
2966 case MINUS_EXPR:
2967 *reduc_fn = IFN_LAST;
2968 return true;
2970 default:
2971 return false;
2975 /* If there is a neutral value X such that SLP reduction NODE would not
2976 be affected by the introduction of additional X elements, return that X,
2977 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2978 is the vector type that would hold element X. REDUC_CHAIN is true if
2979 the SLP statements perform a single reduction, false if each statement
2980 performs an independent reduction. */
2982 static tree
2983 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2984 tree_code code, bool reduc_chain)
2986 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2987 stmt_vec_info stmt_vinfo = stmts[0];
2988 tree scalar_type = TREE_TYPE (vector_type);
2989 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2990 gcc_assert (loop);
2992 switch (code)
2994 case WIDEN_SUM_EXPR:
2995 case DOT_PROD_EXPR:
2996 case SAD_EXPR:
2997 case PLUS_EXPR:
2998 case MINUS_EXPR:
2999 case BIT_IOR_EXPR:
3000 case BIT_XOR_EXPR:
3001 return build_zero_cst (scalar_type);
3003 case MULT_EXPR:
3004 return build_one_cst (scalar_type);
3006 case BIT_AND_EXPR:
3007 return build_all_ones_cst (scalar_type);
3009 case MAX_EXPR:
3010 case MIN_EXPR:
3011 /* For MIN/MAX the initial values are neutral. A reduction chain
3012 has only a single initial value, so that value is neutral for
3013 all statements. */
3014 if (reduc_chain)
3015 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3016 loop_preheader_edge (loop));
3017 return NULL_TREE;
3019 default:
3020 return NULL_TREE;
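/* For instance, padding a vector of partial results with extra 0
   elements does not change a PLUS_EXPR reduction, padding with 1 does
   not change a MULT_EXPR reduction, and padding with all-ones bits does
   not change a BIT_AND_EXPR reduction; MIN/MAX have no such universal
   neutral value, which is why they fall back to the initial value of a
   reduction chain or to NULL_TREE.  */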
3024 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3025 STMT is printed with a message MSG. */
3027 static void
3028 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3030 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3033 /* Return true if we need an in-order reduction for operation CODE
3034 on type TYPE, i.e. whether the reduction must preserve the order of
3035 the scalar computation. */
3037 bool
3038 needs_fold_left_reduction_p (tree type, tree_code code)
3040 /* CHECKME: check for !flag_finite_math_only too? */
3041 if (SCALAR_FLOAT_TYPE_P (type))
3042 switch (code)
3044 case MIN_EXPR:
3045 case MAX_EXPR:
3046 return false;
3048 default:
3049 return !flag_associative_math;
3052 if (INTEGRAL_TYPE_P (type))
3054 if (!operation_no_trapping_overflow (type, code))
3055 return true;
3056 return false;
3059 if (SAT_FIXED_POINT_TYPE_P (type))
3060 return true;
3062 return false;
3065 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3066 has a handled computation expression. Store the main reduction
3067 operation in *CODE. */
3069 static bool
3070 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3071 tree loop_arg, enum tree_code *code,
3072 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3074 auto_bitmap visited;
3075 tree lookfor = PHI_RESULT (phi);
3076 ssa_op_iter curri;
3077 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3078 while (USE_FROM_PTR (curr) != loop_arg)
3079 curr = op_iter_next_use (&curri);
3080 curri.i = curri.numops;
3083 path.safe_push (std::make_pair (curri, curr));
3084 tree use = USE_FROM_PTR (curr);
3085 if (use == lookfor)
3086 break;
3087 gimple *def = SSA_NAME_DEF_STMT (use);
3088 if (gimple_nop_p (def)
3089 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3091 pop:
3094 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3095 curri = x.first;
3096 curr = x.second;
3098 curr = op_iter_next_use (&curri);
3099 /* Skip already visited or non-SSA operands (from iterating
3100 over PHI args). */
3101 while (curr != NULL_USE_OPERAND_P
3102 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3103 || ! bitmap_set_bit (visited,
3104 SSA_NAME_VERSION
3105 (USE_FROM_PTR (curr)))));
3107 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3108 if (curr == NULL_USE_OPERAND_P)
3109 break;
3111 else
3113 if (gimple_code (def) == GIMPLE_PHI)
3114 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3115 else
3116 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3117 while (curr != NULL_USE_OPERAND_P
3118 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3119 || ! bitmap_set_bit (visited,
3120 SSA_NAME_VERSION
3121 (USE_FROM_PTR (curr)))))
3122 curr = op_iter_next_use (&curri);
3123 if (curr == NULL_USE_OPERAND_P)
3124 goto pop;
3127 while (1);
3128 if (dump_file && (dump_flags & TDF_DETAILS))
3130 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3131 unsigned i;
3132 std::pair<ssa_op_iter, use_operand_p> *x;
3133 FOR_EACH_VEC_ELT (path, i, x)
3134 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3135 dump_printf (MSG_NOTE, "\n");
3138 /* Check whether the reduction path detected is valid. */
3139 bool fail = path.length () == 0;
3140 bool neg = false;
3141 int sign = -1;
3142 *code = ERROR_MARK;
3143 for (unsigned i = 1; i < path.length (); ++i)
3145 gimple *use_stmt = USE_STMT (path[i].second);
3146 tree op = USE_FROM_PTR (path[i].second);
3147 if (! is_gimple_assign (use_stmt)
3148 /* The following makes sure we can compute the operand index
3149 easily, and it mostly disallows chaining via COND_EXPR condition
3150 operands. */
3151 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3152 && (gimple_num_ops (use_stmt) <= 2
3153 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3154 && (gimple_num_ops (use_stmt) <= 3
3155 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3157 fail = true;
3158 break;
3160 /* Check there's only a single stmt the op is used on inside
3161 of the loop. */
3162 imm_use_iterator imm_iter;
3163 gimple *op_use_stmt;
3164 unsigned cnt = 0;
3165 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3166 if (!is_gimple_debug (op_use_stmt)
3167 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
3169 /* We want to allow x + x but not x < 1 ? x : 2. */
3170 if (is_gimple_assign (op_use_stmt)
3171 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3173 use_operand_p use_p;
3174 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3175 cnt++;
3177 else
3178 cnt++;
3180 if (cnt != 1)
3182 fail = true;
3183 break;
3185 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3186 if (use_code == MINUS_EXPR)
3188 use_code = PLUS_EXPR;
3189 /* Track whether we negate the reduction value each iteration. */
3190 if (gimple_assign_rhs2 (use_stmt) == op)
3191 neg = ! neg;
3193 if (CONVERT_EXPR_CODE_P (use_code)
3194 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3195 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3197 else if (*code == ERROR_MARK)
3199 *code = use_code;
3200 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3202 else if (use_code != *code)
3204 fail = true;
3205 break;
3207 else if ((use_code == MIN_EXPR
3208 || use_code == MAX_EXPR)
3209 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3211 fail = true;
3212 break;
3215 return ! fail && ! neg && *code != ERROR_MARK;
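/* For example (hypothetical GIMPLE), the cycle

     sum_1 = PHI <sum_0(preheader), sum_2(latch)>
     sum_2 = sum_1 - a[i];

   yields the single-statement path sum_1 -> sum_2; the MINUS_EXPR is
   treated as PLUS_EXPR because the reduction value is not the negated
   operand, so *code is set to PLUS_EXPR and the path is accepted.
   Writing sum_2 = a[i] - sum_1 instead negates the running value every
   iteration, which is tracked in neg above and rejected.  */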
3218 bool
3219 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3220 tree loop_arg, enum tree_code code)
3222 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3223 enum tree_code code_;
3224 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3225 && code_ == code);
3230 /* Function vect_is_simple_reduction
3232 (1) Detect a cross-iteration def-use cycle that represents a simple
3233 reduction computation. We look for the following pattern:
3235 loop_header:
3236 a1 = phi < a0, a2 >
3237 a3 = ...
3238 a2 = operation (a3, a1)
3242 a3 = ...
3243 loop_header:
3244 a1 = phi < a0, a2 >
3245 a2 = operation (a3, a1)
3247 such that:
3248 1. operation is commutative and associative and it is safe to
3249 change the order of the computation
3250 2. no uses for a2 in the loop (a2 is used out of the loop)
3251 3. no uses of a1 in the loop besides the reduction operation
3252 4. no uses of a1 outside the loop.
3254 Conditions 1,4 are tested here.
3255 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3257 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3258 nested cycles.
3260 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3261 reductions:
3263 a1 = phi < a0, a2 >
3264 inner loop (def of a3)
3265 a2 = phi < a3 >
3267 (4) Detect condition expressions, i.e.:
3268 for (int i = 0; i < N; i++)
3269 if (a[i] < val)
3270 ret_val = a[i];
3274 static stmt_vec_info
3275 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3276 bool *double_reduc, bool *reduc_chain_p)
3278 gphi *phi = as_a <gphi *> (phi_info->stmt);
3279 gimple *phi_use_stmt = NULL;
3280 imm_use_iterator imm_iter;
3281 use_operand_p use_p;
3283 *double_reduc = false;
3284 *reduc_chain_p = false;
3285 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3287 tree phi_name = PHI_RESULT (phi);
3288 /* ??? If there are no uses of the PHI result the inner loop reduction
3289 won't be detected as possibly double-reduction by vectorizable_reduction
3290 because that tries to walk the PHI arg from the preheader edge which
3291 can be constant. See PR60382. */
3292 if (has_zero_uses (phi_name))
3293 return NULL;
3294 class loop *loop = (gimple_bb (phi))->loop_father;
3295 unsigned nphi_def_loop_uses = 0;
3296 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3298 gimple *use_stmt = USE_STMT (use_p);
3299 if (is_gimple_debug (use_stmt))
3300 continue;
3302 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3304 if (dump_enabled_p ())
3305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3306 "intermediate value used outside loop.\n");
3308 return NULL;
3311 nphi_def_loop_uses++;
3312 phi_use_stmt = use_stmt;
3315 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3316 if (TREE_CODE (latch_def) != SSA_NAME)
3318 if (dump_enabled_p ())
3319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3320 "reduction: not ssa_name: %T\n", latch_def);
3321 return NULL;
3324 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3325 if (!def_stmt_info
3326 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3327 return NULL;
3329 bool nested_in_vect_loop
3330 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3331 unsigned nlatch_def_loop_uses = 0;
3332 auto_vec<gphi *, 3> lcphis;
3333 bool inner_loop_of_double_reduc = false;
3334 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3336 gimple *use_stmt = USE_STMT (use_p);
3337 if (is_gimple_debug (use_stmt))
3338 continue;
3339 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3340 nlatch_def_loop_uses++;
3341 else
3343 /* We can have more than one loop-closed PHI. */
3344 lcphis.safe_push (as_a <gphi *> (use_stmt));
3345 if (nested_in_vect_loop
3346 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3347 == vect_double_reduction_def))
3348 inner_loop_of_double_reduc = true;
3352 /* If we are vectorizing an inner reduction, we execute it in the
3353 original order only when we are not dealing with a double
3354 reduction. */
3355 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3357 if (dump_enabled_p ())
3358 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3359 "detected nested cycle: ");
3360 return def_stmt_info;
3363 /* If this isn't a nested cycle or if the nested cycle reduction value
3364 is used outside of the inner loop we cannot handle uses of the reduction
3365 value. */
3366 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3368 if (dump_enabled_p ())
3369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3370 "reduction used in loop.\n");
3371 return NULL;
3374 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3375 defined in the inner loop. */
3376 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3378 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3379 if (gimple_phi_num_args (def_stmt) != 1
3380 || TREE_CODE (op1) != SSA_NAME)
3382 if (dump_enabled_p ())
3383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3384 "unsupported phi node definition.\n");
3386 return NULL;
3389 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3390 if (gimple_bb (def1)
3391 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3392 && loop->inner
3393 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3394 && is_gimple_assign (def1)
3395 && is_a <gphi *> (phi_use_stmt)
3396 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3398 if (dump_enabled_p ())
3399 report_vect_op (MSG_NOTE, def_stmt,
3400 "detected double reduction: ");
3402 *double_reduc = true;
3403 return def_stmt_info;
3406 return NULL;
3409 /* Look for the expression computing latch_def from the loop PHI result. */
3410 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3411 enum tree_code code;
3412 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3413 path))
3415 STMT_VINFO_REDUC_CODE (phi_info) = code;
3416 if (code == COND_EXPR && !nested_in_vect_loop)
3417 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3419 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3420 reduction chain for which the additional restriction is that
3421 all operations in the chain are the same. */
3422 auto_vec<stmt_vec_info, 8> reduc_chain;
3423 unsigned i;
3424 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3425 for (i = path.length () - 1; i >= 1; --i)
3427 gimple *stmt = USE_STMT (path[i].second);
3428 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3429 STMT_VINFO_REDUC_IDX (stmt_info)
3430 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3431 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3432 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3433 && (i == 1 || i == path.length () - 1));
3434 if ((stmt_code != code && !leading_conversion)
3435 /* We can only handle the final value in epilogue
3436 generation for reduction chains. */
3437 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3438 is_slp_reduc = false;
3439 /* For reduction chains we support trailing/leading
3440 conversions. We do not store those in the actual chain. */
3441 if (leading_conversion)
3442 continue;
3443 reduc_chain.safe_push (stmt_info);
3445 if (is_slp_reduc && reduc_chain.length () > 1)
3447 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3449 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3450 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3452 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3453 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3455 /* Save the chain for further analysis in SLP detection. */
3456 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3457 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3459 *reduc_chain_p = true;
3460 if (dump_enabled_p ())
3461 dump_printf_loc (MSG_NOTE, vect_location,
3462 "reduction: detected reduction chain\n");
3464 else if (dump_enabled_p ())
3465 dump_printf_loc (MSG_NOTE, vect_location,
3466 "reduction: detected reduction\n");
3468 return def_stmt_info;
3471 if (dump_enabled_p ())
3472 dump_printf_loc (MSG_NOTE, vect_location,
3473 "reduction: unknown pattern\n");
3475 return NULL;
3478 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3479 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3480 or -1 if not known. */
3482 static int
3483 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3485 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3486 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3488 if (dump_enabled_p ())
3489 dump_printf_loc (MSG_NOTE, vect_location,
3490 "cost model: epilogue peel iters set to vf/2 "
3491 "because loop iterations are unknown .\n");
3492 return assumed_vf / 2;
3494 else
3496 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3497 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3498 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3499 /* If we need to peel for gaps but no epilogue peeling would otherwise
3500 be required, we have to peel VF iterations. */
3501 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3502 peel_iters_epilogue = assumed_vf;
3503 return peel_iters_epilogue;
3507 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3508 int
3509 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3510 int *peel_iters_epilogue,
3511 stmt_vector_for_cost *scalar_cost_vec,
3512 stmt_vector_for_cost *prologue_cost_vec,
3513 stmt_vector_for_cost *epilogue_cost_vec)
3515 int retval = 0;
3517 *peel_iters_epilogue
3518 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3520 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3522 /* If peeled iterations are known but the number of scalar loop
3523 iterations is unknown, count a taken branch per peeled loop. */
3524 if (peel_iters_prologue > 0)
3525 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3526 NULL, NULL_TREE, 0, vect_prologue);
3527 if (*peel_iters_epilogue > 0)
3528 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3529 NULL, NULL_TREE, 0, vect_epilogue);
3532 stmt_info_for_cost *si;
3533 int j;
3534 if (peel_iters_prologue)
3535 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3536 retval += record_stmt_cost (prologue_cost_vec,
3537 si->count * peel_iters_prologue,
3538 si->kind, si->stmt_info, si->misalign,
3539 vect_prologue);
3540 if (*peel_iters_epilogue)
3541 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3542 retval += record_stmt_cost (epilogue_cost_vec,
3543 si->count * *peel_iters_epilogue,
3544 si->kind, si->stmt_info, si->misalign,
3545 vect_epilogue);
3547 return retval;
3550 /* Function vect_estimate_min_profitable_iters
3552 Return the number of iterations required for the vector version of the
3553 loop to be profitable relative to the cost of the scalar version of the
3554 loop.
3556 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3557 of iterations for vectorization. A value of -1 means loop vectorization
3558 is not profitable. The returned value may be used for a dynamic
3559 profitability check.
3561 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3562 for static check against estimated number of iterations. */
3564 static void
3565 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3566 int *ret_min_profitable_niters,
3567 int *ret_min_profitable_estimate)
3569 int min_profitable_iters;
3570 int min_profitable_estimate;
3571 int peel_iters_prologue;
3572 int peel_iters_epilogue;
3573 unsigned vec_inside_cost = 0;
3574 int vec_outside_cost = 0;
3575 unsigned vec_prologue_cost = 0;
3576 unsigned vec_epilogue_cost = 0;
3577 int scalar_single_iter_cost = 0;
3578 int scalar_outside_cost = 0;
3579 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3580 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3581 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3583 /* Cost model disabled. */
3584 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3586 if (dump_enabled_p ())
3587 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3588 *ret_min_profitable_niters = 0;
3589 *ret_min_profitable_estimate = 0;
3590 return;
3593 /* Requires loop versioning tests to handle misalignment. */
3594 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3596 /* FIXME: Make cost depend on complexity of individual check. */
3597 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3598 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3599 NULL, NULL_TREE, 0, vect_prologue);
3600 if (dump_enabled_p ())
3601 dump_printf (MSG_NOTE,
3602 "cost model: Adding cost of checks for loop "
3603 "versioning to treat misalignment.\n");
3606 /* Requires loop versioning with alias checks. */
3607 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3609 /* FIXME: Make cost depend on complexity of individual check. */
3610 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3611 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3612 NULL, NULL_TREE, 0, vect_prologue);
3613 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3614 if (len)
3615 /* Count LEN - 1 ANDs and LEN comparisons. */
3616 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3617 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3618 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3619 if (len)
3621 /* Count LEN - 1 ANDs and LEN comparisons. */
3622 unsigned int nstmts = len * 2 - 1;
3623 /* +1 for each bias that needs adding. */
3624 for (unsigned int i = 0; i < len; ++i)
3625 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3626 nstmts += 1;
3627 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3628 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
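/* Illustrative count: three lower-bound checks, one of them on a signed
   quantity, cost 3 * 2 - 1 = 5 comparisons/ANDs plus one bias addition,
   i.e. nstmts = 6 scalar statements in the prologue. */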
3630 if (dump_enabled_p ())
3631 dump_printf (MSG_NOTE,
3632 "cost model: Adding cost of checks for loop "
3633 "versioning aliasing.\n");
3636 /* Requires loop versioning with niter checks. */
3637 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3639 /* FIXME: Make cost depend on complexity of individual check. */
3640 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3641 NULL, NULL_TREE, 0, vect_prologue);
3642 if (dump_enabled_p ())
3643 dump_printf (MSG_NOTE,
3644 "cost model: Adding cost of checks for loop "
3645 "versioning niters.\n");
3648 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3649 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3650 NULL, NULL_TREE, 0, vect_prologue);
3652 /* Count statements in scalar loop. Using this as scalar cost for a single
3653 iteration for now.
3655 TODO: Add outer loop support.
3657 TODO: Consider assigning different costs to different scalar
3658 statements. */
3660 scalar_single_iter_cost
3661 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3663 /* Add the cost of the peeled instructions in the prologue and epilogue
3664 loops. (For fully-masked loops there will be no peeling.)
3666 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3667 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3669 TODO: Build an expression that represents peel_iters for prologue and
3670 epilogue to be used in a run-time test. */
3672 bool prologue_need_br_taken_cost = false;
3673 bool prologue_need_br_not_taken_cost = false;
3675 /* Calculate peel_iters_prologue. */
3676 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3677 peel_iters_prologue = 0;
3678 else if (npeel < 0)
3680 peel_iters_prologue = assumed_vf / 2;
3681 if (dump_enabled_p ())
3682 dump_printf (MSG_NOTE, "cost model: "
3683 "prologue peel iters set to vf/2.\n");
3685 /* If peeled iterations are unknown, count a taken branch and a not taken
3686 branch per peeled loop. Even if scalar loop iterations are known,
3687 vector iterations are not known since peeled prologue iterations are
3688 not known. Hence guards remain the same. */
3689 prologue_need_br_taken_cost = true;
3690 prologue_need_br_not_taken_cost = true;
3692 else
3694 peel_iters_prologue = npeel;
3695 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3696 /* If peeled iterations are known but the number of scalar loop
3697 iterations is unknown, count a taken branch per peeled loop. */
3698 prologue_need_br_taken_cost = true;
3701 bool epilogue_need_br_taken_cost = false;
3702 bool epilogue_need_br_not_taken_cost = false;
3704 /* Calculate peel_iters_epilogue. */
3705 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3706 /* We need to peel exactly one iteration for gaps. */
3707 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3708 else if (npeel < 0)
3710 /* If the peeling for alignment is unknown, the loop bound of the
3711 main loop becomes unknown. */
3712 peel_iters_epilogue = assumed_vf / 2;
3713 if (dump_enabled_p ())
3714 dump_printf (MSG_NOTE, "cost model: "
3715 "epilogue peel iters set to vf/2 because "
3716 "peeling for alignment is unknown.\n");
3718 /* See the same reason above in peel_iters_prologue calculation. */
3719 epilogue_need_br_taken_cost = true;
3720 epilogue_need_br_not_taken_cost = true;
3722 else
3724 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3725 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3726 /* If peeled iterations are known but the number of scalar loop
3727 iterations is unknown, count a taken branch per peeled loop. */
3728 epilogue_need_br_taken_cost = true;
3731 stmt_info_for_cost *si;
3732 int j;
3733 /* Add costs associated with peel_iters_prologue. */
3734 if (peel_iters_prologue)
3735 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3737 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3738 si->count * peel_iters_prologue, si->kind,
3739 si->stmt_info, si->vectype, si->misalign,
3740 vect_prologue);
3743 /* Add costs associated with peel_iters_epilogue. */
3744 if (peel_iters_epilogue)
3745 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3747 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3748 si->count * peel_iters_epilogue, si->kind,
3749 si->stmt_info, si->vectype, si->misalign,
3750 vect_epilogue);
3753 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3755 if (prologue_need_br_taken_cost)
3756 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3757 NULL, NULL_TREE, 0, vect_prologue);
3759 if (prologue_need_br_not_taken_cost)
3760 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3761 cond_branch_not_taken, NULL, NULL_TREE, 0,
3762 vect_prologue);
3764 if (epilogue_need_br_taken_cost)
3765 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3766 NULL, NULL_TREE, 0, vect_epilogue);
3768 if (epilogue_need_br_not_taken_cost)
3769 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3770 cond_branch_not_taken, NULL, NULL_TREE, 0,
3771 vect_epilogue);
3773 /* Take care of special costs for rgroup controls of partial vectors. */
3774 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3776 /* Calculate how many masks we need to generate. */
3777 unsigned int num_masks = 0;
3778 rgroup_controls *rgm;
3779 unsigned int num_vectors_m1;
3780 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3781 if (rgm->type)
3782 num_masks += num_vectors_m1 + 1;
3783 gcc_assert (num_masks > 0);
3785 /* In the worst case, we need to generate each mask in the prologue
3786 and in the loop body. One of the loop body mask instructions
3787 replaces the comparison in the scalar loop, and since we don't
3788 count the scalar comparison against the scalar body, we shouldn't
3789 count that vector instruction against the vector body either.
3791 Sometimes we can use unpacks instead of generating prologue
3792 masks and sometimes the prologue mask will fold to a constant,
3793 so the actual prologue cost might be smaller. However, it's
3794 simpler and safer to use the worst-case cost; if this ends up
3795 being the tie-breaker between vectorizing or not, then it's
3796 probably better not to vectorize. */
3797 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
3798 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
3799 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
3800 vector_stmt, NULL, NULL_TREE, 0, vect_body);
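/* Illustrative count: two mask rgroups needing one and two vectors
   respectively give num_masks = 1 + 2 = 3, i.e. three mask computations
   in the prologue and two in the loop body (one body mask replaces the
   scalar loop's comparison and is therefore not counted). */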
3802 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
3804 /* Referring to the functions vect_set_loop_condition_partial_vectors
3805 and vect_set_loop_controls_directly, we need to generate each
3806 length in the prologue and in the loop body if required. Although
3807 there are some possible optimizations, we consider the worst case
3808 here. */
3810 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
3811 bool need_iterate_p
3812 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3813 && !vect_known_niters_smaller_than_vf (loop_vinfo));
3815 /* Calculate how many statements to be added. */
3816 unsigned int prologue_stmts = 0;
3817 unsigned int body_stmts = 0;
3819 rgroup_controls *rgc;
3820 unsigned int num_vectors_m1;
3821 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
3822 if (rgc->type)
3824 /* May need one SHIFT for nitems_total computation. */
3825 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
3826 if (nitems != 1 && !niters_known_p)
3827 prologue_stmts += 1;
3829 /* May need one MAX and one MINUS for wrap around. */
3830 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
3831 prologue_stmts += 2;
3833 /* Need one MAX and one MINUS for each batch limit except for
3834 the first one. */
3835 prologue_stmts += num_vectors_m1 * 2;
3837 unsigned int num_vectors = num_vectors_m1 + 1;
3839 /* Need to set up lengths in prologue, only one MIN required
3840 for each since start index is zero. */
3841 prologue_stmts += num_vectors;
3843 /* Each may need two MINs and one MINUS to update lengths in body
3844 for next iteration. */
3845 if (need_iterate_p)
3846 body_stmts += 3 * num_vectors;
3849 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
3850 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3851 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
3852 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
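/* Illustrative count: a single length rgroup with two vectors
   (num_vectors_m1 == 1), nitems != 1, unknown niters and a possibly
   wrapping IV gives 1 (SHIFT) + 2 (MAX/MINUS for wrap around)
   + 2 (batch limits) + 2 (initial MINs) = 7 prologue statements and
   3 * 2 = 6 body statements when the loop needs to iterate. */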
3855 /* FORNOW: The scalar outside cost is incremented in one of the
3856 following ways:
3858 1. The vectorizer checks for alignment and aliasing and generates
3859 a condition that allows dynamic vectorization. A cost model
3860 check is ANDED with the versioning condition. Hence scalar code
3861 path now has the added cost of the versioning check.
3863 if (cost > th & versioning_check)
3864 jmp to vector code
3866 Hence run-time scalar is incremented by not-taken branch cost.
3868 2. The vectorizer then checks if a prologue is required. If the
3869 cost model check was not done before during versioning, it has to
3870 be done before the prologue check.
3872 if (cost <= th)
3873 prologue = scalar_iters
3874 if (prologue == 0)
3875 jmp to vector code
3876 else
3877 execute prologue
3878 if (prologue == num_iters)
3879 go to exit
3881 Hence the run-time scalar cost is incremented by a taken branch,
3882 plus a not-taken branch, plus a taken branch cost.
3884 3. The vectorizer then checks if an epilogue is required. If the
3885 cost model check was not done before during prologue check, it
3886 has to be done with the epilogue check.
3888 if (prologue == 0)
3889 jmp to vector code
3890 else
3891 execute prologue
3892 if (prologue == num_iters)
3893 go to exit
3894 vector code:
3895 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3896 jmp to epilogue
3898 Hence the run-time scalar cost should be incremented by 2 taken
3899 branches.
3901 TODO: The back end may reorder the BBs differently and reverse
3902 conditions/branch directions. Change the estimates below to
3903 something more reasonable. */
3905 /* If the number of iterations is known and we do not do versioning, we can
3906 decide whether to vectorize at compile time. Hence the scalar version
3907 does not carry cost model guard costs. */
3908 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3909 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3911 /* Cost model check occurs at versioning. */
3912 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3913 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3914 else
3916 /* Cost model check occurs at prologue generation. */
3917 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3918 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3919 + vect_get_stmt_cost (cond_branch_not_taken);
3920 /* Cost model check occurs at epilogue generation. */
3921 else
3922 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3926 /* Complete the target-specific cost calculations. */
3927 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3928 &vec_inside_cost, &vec_epilogue_cost);
3930 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3932 /* Stash the costs so that we can compare two loop_vec_infos. */
3933 loop_vinfo->vec_inside_cost = vec_inside_cost;
3934 loop_vinfo->vec_outside_cost = vec_outside_cost;
3936 if (dump_enabled_p ())
3938 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3939 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3940 vec_inside_cost);
3941 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3942 vec_prologue_cost);
3943 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3944 vec_epilogue_cost);
3945 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3946 scalar_single_iter_cost);
3947 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3948 scalar_outside_cost);
3949 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3950 vec_outside_cost);
3951 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3952 peel_iters_prologue);
3953 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3954 peel_iters_epilogue);
3957 /* Calculate number of iterations required to make the vector version
3958 profitable, relative to the loop bodies only. The following condition
3959 must hold true:
3960 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3961 where
3962 SIC = scalar iteration cost, VIC = vector iteration cost,
3963 VOC = vector outside cost, VF = vectorization factor,
3964 NPEEL = prologue iterations + epilogue iterations,
3965 SOC = scalar outside cost for run time cost model check. */
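/* Worked example with purely illustrative costs: SIC = 4, VIC = 6,
   VF = 4, NPEEL = 2 (one prologue and one epilogue iteration),
   VOC = 20 and SOC = 0. Each vector iteration then saves
   SIC * VF - VIC = 10 units over the scalar code, and the
   straightforward (non-partial-vector) calculation below yields a
   break-even point of 7 scalar iterations. */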
3967 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3968 - vec_inside_cost);
3969 if (saving_per_viter <= 0)
3971 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3972 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3973 "vectorization did not happen for a simd loop");
3975 if (dump_enabled_p ())
3976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3977 "cost model: the vector iteration cost = %d "
3978 "divided by the scalar iteration cost = %d "
3979 "is greater or equal to the vectorization factor = %d"
3980 ".\n",
3981 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3982 *ret_min_profitable_niters = -1;
3983 *ret_min_profitable_estimate = -1;
3984 return;
3987 /* ??? The "if" arm is written to handle all cases; see below for what
3988 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
3989 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3991 /* Rewriting the condition above in terms of the number of
3992 vector iterations (vniters) rather than the number of
3993 scalar iterations (niters) gives:
3995 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3997 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3999 For integer N, X and Y when X > 0:
4001 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4002 int outside_overhead = (vec_outside_cost
4003 - scalar_single_iter_cost * peel_iters_prologue
4004 - scalar_single_iter_cost * peel_iters_epilogue
4005 - scalar_outside_cost);
4006 /* We're only interested in cases that require at least one
4007 vector iteration. */
4008 int min_vec_niters = 1;
4009 if (outside_overhead > 0)
4010 min_vec_niters = outside_overhead / saving_per_viter + 1;
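/* E.g. with the illustrative numbers above, outside_overhead
   = 20 - 4 - 4 - 0 = 12 and saving_per_viter = 10, so
   min_vec_niters = 12 / 10 + 1 = 2. */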
4012 if (dump_enabled_p ())
4013 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4014 min_vec_niters);
4016 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4018 /* Now that we know the minimum number of vector iterations,
4019 find the minimum niters for which the scalar cost is larger:
4021 SIC * niters > VIC * vniters + VOC - SOC
4023 We know that the minimum niters is no more than
4024 vniters * VF + NPEEL, but it might be (and often is) less
4025 than that if a partial vector iteration is cheaper than the
4026 equivalent scalar code. */
4027 int threshold = (vec_inside_cost * min_vec_niters
4028 + vec_outside_cost
4029 - scalar_outside_cost);
4030 if (threshold <= 0)
4031 min_profitable_iters = 1;
4032 else
4033 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4035 else
4036 /* Convert the number of vector iterations into a number of
4037 scalar iterations. */
4038 min_profitable_iters = (min_vec_niters * assumed_vf
4039 + peel_iters_prologue
4040 + peel_iters_epilogue);
4042 else
4044 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4045 * assumed_vf
4046 - vec_inside_cost * peel_iters_prologue
4047 - vec_inside_cost * peel_iters_epilogue);
4048 if (min_profitable_iters <= 0)
4049 min_profitable_iters = 0;
4050 else
4052 min_profitable_iters /= saving_per_viter;
4054 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4055 <= (((int) vec_inside_cost * min_profitable_iters)
4056 + (((int) vec_outside_cost - scalar_outside_cost)
4057 * assumed_vf)))
4058 min_profitable_iters++;
4062 if (dump_enabled_p ())
4063 dump_printf (MSG_NOTE,
4064 " Calculated minimum iters for profitability: %d\n",
4065 min_profitable_iters);
4067 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4068 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4069 /* We want the vectorized loop to execute at least once. */
4070 min_profitable_iters = assumed_vf + peel_iters_prologue;
4071 else if (min_profitable_iters < peel_iters_prologue)
4072 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4073 vectorized loop executes at least once. */
4074 min_profitable_iters = peel_iters_prologue;
4076 if (dump_enabled_p ())
4077 dump_printf_loc (MSG_NOTE, vect_location,
4078 " Runtime profitability threshold = %d\n",
4079 min_profitable_iters);
4081 *ret_min_profitable_niters = min_profitable_iters;
4083 /* Calculate number of iterations required to make the vector version
4084 profitable, relative to the loop bodies only.
4086 The non-vectorized variant costs SIC * niters and it must win over the
4087 vector variant for the expected loop trip count. The following condition must hold true:
4088 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4090 if (vec_outside_cost <= 0)
4091 min_profitable_estimate = 0;
4092 /* ??? This "else if" arm is written to handle all cases; see below for
4093 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4094 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4096 /* This is a repeat of the code above, but with + SOC rather
4097 than - SOC. */
4098 int outside_overhead = (vec_outside_cost
4099 - scalar_single_iter_cost * peel_iters_prologue
4100 - scalar_single_iter_cost * peel_iters_epilogue
4101 + scalar_outside_cost);
4102 int min_vec_niters = 1;
4103 if (outside_overhead > 0)
4104 min_vec_niters = outside_overhead / saving_per_viter + 1;
4106 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4108 int threshold = (vec_inside_cost * min_vec_niters
4109 + vec_outside_cost
4110 + scalar_outside_cost);
4111 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4113 else
4114 min_profitable_estimate = (min_vec_niters * assumed_vf
4115 + peel_iters_prologue
4116 + peel_iters_epilogue);
4118 else
4120 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4121 * assumed_vf
4122 - vec_inside_cost * peel_iters_prologue
4123 - vec_inside_cost * peel_iters_epilogue)
4124 / ((scalar_single_iter_cost * assumed_vf)
4125 - vec_inside_cost);
4127 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4128 if (dump_enabled_p ())
4129 dump_printf_loc (MSG_NOTE, vect_location,
4130 " Static estimate profitability threshold = %d\n",
4131 min_profitable_estimate);
4133 *ret_min_profitable_estimate = min_profitable_estimate;
4136 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4137 vector elements (not bits) for a vector with NELT elements. */
4138 static void
4139 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4140 vec_perm_builder *sel)
4142 /* The encoding is a single stepped pattern. Any wrap-around is handled
4143 by vec_perm_indices. */
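/* Illustrative selector (hypothetical values): for OFFSET = 2 and
   NELT = 8 the stepped encoding {2, 3, 4} expands to
   {2, 3, 4, 5, 6, 7, 8, 9}, i.e. the last six elements of the first
   input followed by the first two elements of the second input. */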
4144 sel->new_vector (nelt, 1, 3);
4145 for (unsigned int i = 0; i < 3; i++)
4146 sel->quick_push (i + offset);
4149 /* Checks whether the target supports whole-vector shifts for vectors of mode
4150 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4151 it supports vec_perm_const with masks for all necessary shift amounts. */
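/* For example, for a constant 8-element vector mode without vec_shr
   support, the loop below checks permutations for shifts by 4, 2 and
   1 elements. */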
4152 static bool
4153 have_whole_vector_shift (machine_mode mode)
4155 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4156 return true;
4158 /* Variable-length vectors should be handled via the optab. */
4159 unsigned int nelt;
4160 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4161 return false;
4163 vec_perm_builder sel;
4164 vec_perm_indices indices;
4165 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4167 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4168 indices.new_vector (sel, 2, nelt);
4169 if (!can_vec_perm_const_p (mode, indices, false))
4170 return false;
4172 return true;
4175 /* TODO: There is a close dependency between the vect_model_*_cost and
4176 vectorizable_* functions. Design this better to avoid maintenance issues. */
4178 /* Function vect_model_reduction_cost.
4180 Models cost for a reduction operation, including the vector ops
4181 generated within the strip-mine loop, the initial definition before
4182 the loop, and the epilogue code that must be generated. */
4184 static void
4185 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4186 stmt_vec_info stmt_info, internal_fn reduc_fn,
4187 vect_reduction_type reduction_type,
4188 int ncopies, stmt_vector_for_cost *cost_vec)
4190 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4191 enum tree_code code;
4192 optab optab;
4193 tree vectype;
4194 machine_mode mode;
4195 class loop *loop = NULL;
4197 if (loop_vinfo)
4198 loop = LOOP_VINFO_LOOP (loop_vinfo);
4200 /* Condition reductions generate two reductions in the loop. */
4201 if (reduction_type == COND_REDUCTION)
4202 ncopies *= 2;
4204 vectype = STMT_VINFO_VECTYPE (stmt_info);
4205 mode = TYPE_MODE (vectype);
4206 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4208 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4210 if (reduction_type == EXTRACT_LAST_REDUCTION)
4211 /* No extra instructions are needed in the prologue. The loop body
4212 operations are costed in vectorizable_condition. */
4213 inside_cost = 0;
4214 else if (reduction_type == FOLD_LEFT_REDUCTION)
4216 /* No extra instructions needed in the prologue. */
4217 prologue_cost = 0;
4219 if (reduc_fn != IFN_LAST)
4220 /* Count one reduction-like operation per vector. */
4221 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4222 stmt_info, 0, vect_body);
4223 else
4225 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4226 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4227 inside_cost = record_stmt_cost (cost_vec, nelements,
4228 vec_to_scalar, stmt_info, 0,
4229 vect_body);
4230 inside_cost += record_stmt_cost (cost_vec, nelements,
4231 scalar_stmt, stmt_info, 0,
4232 vect_body);
4235 else
4237 /* Add in cost for initial definition.
4238 For cond reduction we have four vectors: initial index, step,
4239 initial result of the data reduction, initial value of the index
4240 reduction. */
4241 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4242 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4243 scalar_to_vec, stmt_info, 0,
4244 vect_prologue);
4246 /* Cost of reduction op inside loop. */
4247 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4248 stmt_info, 0, vect_body);
4251 /* Determine cost of epilogue code.
4253 We have a reduction operator that will reduce the vector in one statement.
4254 Also requires scalar extract. */
4256 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4258 if (reduc_fn != IFN_LAST)
4260 if (reduction_type == COND_REDUCTION)
4262 /* An EQ stmt and a COND_EXPR stmt. */
4263 epilogue_cost += record_stmt_cost (cost_vec, 2,
4264 vector_stmt, stmt_info, 0,
4265 vect_epilogue);
4266 /* Reduction of the max index and a reduction of the found
4267 values. */
4268 epilogue_cost += record_stmt_cost (cost_vec, 2,
4269 vec_to_scalar, stmt_info, 0,
4270 vect_epilogue);
4271 /* A broadcast of the max value. */
4272 epilogue_cost += record_stmt_cost (cost_vec, 1,
4273 scalar_to_vec, stmt_info, 0,
4274 vect_epilogue);
4276 else
4278 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4279 stmt_info, 0, vect_epilogue);
4280 epilogue_cost += record_stmt_cost (cost_vec, 1,
4281 vec_to_scalar, stmt_info, 0,
4282 vect_epilogue);
4285 else if (reduction_type == COND_REDUCTION)
4287 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4288 /* Extraction of scalar elements. */
4289 epilogue_cost += record_stmt_cost (cost_vec,
4290 2 * estimated_nunits,
4291 vec_to_scalar, stmt_info, 0,
4292 vect_epilogue);
4293 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4294 epilogue_cost += record_stmt_cost (cost_vec,
4295 2 * estimated_nunits - 3,
4296 scalar_stmt, stmt_info, 0,
4297 vect_epilogue);
4299 else if (reduction_type == EXTRACT_LAST_REDUCTION
4300 || reduction_type == FOLD_LEFT_REDUCTION)
4301 /* No extra instructions are needed in the epilogue. */
4303 else
4305 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4306 tree bitsize =
4307 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4308 int element_bitsize = tree_to_uhwi (bitsize);
4309 int nelements = vec_size_in_bits / element_bitsize;
4311 if (code == COND_EXPR)
4312 code = MAX_EXPR;
4314 optab = optab_for_tree_code (code, vectype, optab_default);
4316 /* We have a whole vector shift available. */
4317 if (optab != unknown_optab
4318 && VECTOR_MODE_P (mode)
4319 && optab_handler (optab, mode) != CODE_FOR_nothing
4320 && have_whole_vector_shift (mode))
4322 /* Final reduction via vector shifts and the reduction operator.
4323 Also requires scalar extract. */
4324 epilogue_cost += record_stmt_cost (cost_vec,
4325 exact_log2 (nelements) * 2,
4326 vector_stmt, stmt_info, 0,
4327 vect_epilogue);
4328 epilogue_cost += record_stmt_cost (cost_vec, 1,
4329 vec_to_scalar, stmt_info, 0,
4330 vect_epilogue);
4332 else
4333 /* Use extracts and reduction op for final reduction. For N
4334 elements, we have N extracts and N-1 reduction ops. */
4335 epilogue_cost += record_stmt_cost (cost_vec,
4336 nelements + nelements - 1,
4337 vector_stmt, stmt_info, 0,
4338 vect_epilogue);
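/* Illustrative comparison for a hypothetical 4-element vector: the
   shift-based scheme above costs exact_log2 (4) * 2 = 4 vector
   statements plus one extract, whereas this fallback costs
   4 + 3 = 7 statements. */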
4342 if (dump_enabled_p ())
4343 dump_printf (MSG_NOTE,
4344 "vect_model_reduction_cost: inside_cost = %d, "
4345 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4346 prologue_cost, epilogue_cost);
4350 /* Function vect_model_induction_cost.
4352 Models cost for induction operations. */
4354 static void
4355 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4356 stmt_vector_for_cost *cost_vec)
4358 unsigned inside_cost, prologue_cost;
4360 if (PURE_SLP_STMT (stmt_info))
4361 return;
4363 /* loop cost for vec_loop. */
4364 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4365 stmt_info, 0, vect_body);
4367 /* prologue cost for vec_init and vec_step. */
4368 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4369 stmt_info, 0, vect_prologue);
4371 if (dump_enabled_p ())
4372 dump_printf_loc (MSG_NOTE, vect_location,
4373 "vect_model_induction_cost: inside_cost = %d, "
4374 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4379 /* Function get_initial_def_for_reduction
4381 Input:
4382 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4383 INIT_VAL - the initial value of the reduction variable
4385 Output:
4386 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4387 of the reduction (used for adjusting the epilog - see below).
4388 Return a vector variable, initialized according to the operation that
4389 STMT_VINFO performs. This vector will be used as the initial value
4390 of the vector of partial results.
4392 Option1 (adjust in epilog): Initialize the vector as follows:
4393 add/bit or/xor: [0,0,...,0,0]
4394 mult/bit and: [1,1,...,1,1]
4395 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4396 and when necessary (e.g. add/mult case) let the caller know
4397 that it needs to adjust the result by init_val.
4399 Option2: Initialize the vector as follows:
4400 add/bit or/xor: [init_val,0,0,...,0]
4401 mult/bit and: [init_val,1,1,...,1]
4402 min/max/cond_expr: [init_val,init_val,...,init_val]
4403 and no adjustments are needed.
4405 For example, for the following code:
4407 s = init_val;
4408 for (i=0;i<n;i++)
4409 s = s + a[i];
4411 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4412 For a vector of 4 units, we want to return either [0,0,0,init_val],
4413 or [0,0,0,0] and let the caller know that it needs to adjust
4414 the result at the end by 'init_val'.
4416 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4417 is not NULL, because this way the initialization vector is simpler (the
4418 same element in all entries), and Option2 otherwise.
4420 A cost model should help decide between these two schemes. */
4422 static tree
4423 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4424 stmt_vec_info stmt_vinfo,
4425 enum tree_code code, tree init_val,
4426 tree *adjustment_def)
4428 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4429 tree scalar_type = TREE_TYPE (init_val);
4430 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4431 tree def_for_init;
4432 tree init_def;
4433 REAL_VALUE_TYPE real_init_val = dconst0;
4434 int int_init_val = 0;
4435 gimple_seq stmts = NULL;
4437 gcc_assert (vectype);
4439 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4440 || SCALAR_FLOAT_TYPE_P (scalar_type));
4442 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4443 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4445 /* ADJUSTMENT_DEF is NULL when called from
4446 vect_create_epilog_for_reduction to vectorize double reduction. */
4447 if (adjustment_def)
4448 *adjustment_def = NULL;
4450 switch (code)
4452 case WIDEN_SUM_EXPR:
4453 case DOT_PROD_EXPR:
4454 case SAD_EXPR:
4455 case PLUS_EXPR:
4456 case MINUS_EXPR:
4457 case BIT_IOR_EXPR:
4458 case BIT_XOR_EXPR:
4459 case MULT_EXPR:
4460 case BIT_AND_EXPR:
4462 if (code == MULT_EXPR)
4464 real_init_val = dconst1;
4465 int_init_val = 1;
4468 if (code == BIT_AND_EXPR)
4469 int_init_val = -1;
4471 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4472 def_for_init = build_real (scalar_type, real_init_val);
4473 else
4474 def_for_init = build_int_cst (scalar_type, int_init_val);
4476 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4478 /* Option1: the first element is '0' or '1' as well. */
4479 if (!operand_equal_p (def_for_init, init_val, 0))
4480 *adjustment_def = init_val;
4481 init_def = gimple_build_vector_from_val (&stmts, vectype,
4482 def_for_init);
4484 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4486 /* Option2 (variable length): the first element is INIT_VAL. */
4487 init_def = gimple_build_vector_from_val (&stmts, vectype,
4488 def_for_init);
4489 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4490 vectype, init_def, init_val);
4492 else
4494 /* Option2: the first element is INIT_VAL. */
4495 tree_vector_builder elts (vectype, 1, 2);
4496 elts.quick_push (init_val);
4497 elts.quick_push (def_for_init);
4498 init_def = gimple_build_vector (&stmts, &elts);
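/* E.g. (illustrative) for a PLUS_EXPR reduction of four ints with
   initial value s, DEF_FOR_INIT is 0 and the two-element encoding
   {init_val, def_for_init} extends by repeating its last element,
   giving the Option2 vector { s, 0, 0, 0 }. */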
4501 break;
4503 case MIN_EXPR:
4504 case MAX_EXPR:
4505 case COND_EXPR:
4507 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4508 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4510 break;
4512 default:
4513 gcc_unreachable ();
4516 if (stmts)
4517 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4518 return init_def;
4521 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4522 NUMBER_OF_VECTORS is the number of vector defs to create.
4523 If NEUTRAL_OP is nonnull, introducing extra elements of that
4524 value will not change the result. */
4526 static void
4527 get_initial_defs_for_reduction (vec_info *vinfo,
4528 slp_tree slp_node,
4529 vec<tree> *vec_oprnds,
4530 unsigned int number_of_vectors,
4531 bool reduc_chain, tree neutral_op)
4533 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4534 stmt_vec_info stmt_vinfo = stmts[0];
4535 unsigned HOST_WIDE_INT nunits;
4536 unsigned j, number_of_places_left_in_vector;
4537 tree vector_type;
4538 unsigned int group_size = stmts.length ();
4539 unsigned int i;
4540 class loop *loop;
4542 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4544 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4546 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4547 gcc_assert (loop);
4548 edge pe = loop_preheader_edge (loop);
4550 gcc_assert (!reduc_chain || neutral_op);
4552 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4553 created vectors. It is greater than 1 if unrolling is performed.
4555 For example, we have two scalar operands, s1 and s2 (e.g., group of
4556 strided accesses of size two), while NUNITS is four (i.e., four scalars
4557 of this type can be packed in a vector). The output vector will contain
4558 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4559 will be 2).
4561 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4562 vectors containing the operands.
4564 For example, NUNITS is four as before, and the group size is 8
4565 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4566 {s5, s6, s7, s8}. */
4568 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4569 nunits = group_size;
4571 number_of_places_left_in_vector = nunits;
4572 bool constant_p = true;
4573 tree_vector_builder elts (vector_type, nunits, 1);
4574 elts.quick_grow (nunits);
4575 gimple_seq ctor_seq = NULL;
4576 for (j = 0; j < nunits * number_of_vectors; ++j)
4578 tree op;
4579 i = j % group_size;
4580 stmt_vinfo = stmts[i];
4582 /* Get the def before the loop. In a reduction chain we have only
4583 one initial value; otherwise we have as many as there are PHIs in the group. */
4584 if (reduc_chain)
4585 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4586 else if (((vec_oprnds->length () + 1) * nunits
4587 - number_of_places_left_in_vector >= group_size)
4588 && neutral_op)
4589 op = neutral_op;
4590 else
4591 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4593 /* Create 'vect_ = {op0,op1,...,opn}'. */
4594 number_of_places_left_in_vector--;
4595 elts[nunits - number_of_places_left_in_vector - 1] = op;
4596 if (!CONSTANT_CLASS_P (op))
4597 constant_p = false;
4599 if (number_of_places_left_in_vector == 0)
4601 tree init;
4602 if (constant_p && !neutral_op
4603 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4604 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4605 /* Build the vector directly from ELTS. */
4606 init = gimple_build_vector (&ctor_seq, &elts);
4607 else if (neutral_op)
4609 /* Build a vector of the neutral value and shift the
4610 other elements into place. */
4611 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4612 neutral_op);
4613 int k = nunits;
4614 while (k > 0 && elts[k - 1] == neutral_op)
4615 k -= 1;
4616 while (k > 0)
4618 k -= 1;
4619 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4620 vector_type, init, elts[k]);
4623 else
4625 /* First time round, duplicate ELTS to fill the
4626 required number of vectors. */
4627 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4628 number_of_vectors, *vec_oprnds);
4629 break;
4631 vec_oprnds->quick_push (init);
4633 number_of_places_left_in_vector = nunits;
4634 elts.new_vector (vector_type, nunits, 1);
4635 elts.quick_grow (nunits);
4636 constant_p = true;
4639 if (ctor_seq != NULL)
4640 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4643 /* For a statement STMT_INFO taking part in a reduction operation return
4644 the stmt_vec_info the meta information is stored on. */
4646 stmt_vec_info
4647 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4649 stmt_info = vect_orig_stmt (stmt_info);
4650 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4651 if (!is_a <gphi *> (stmt_info->stmt)
4652 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4653 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4654 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4655 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4657 if (gimple_phi_num_args (phi) == 1)
4658 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4660 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4662 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4663 stmt_vec_info info
4664 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4665 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4666 stmt_info = info;
4668 return stmt_info;
4671 /* Function vect_create_epilog_for_reduction
4673 Create code at the loop-epilog to finalize the result of a reduction
4674 computation.
4676 STMT_INFO is the scalar reduction stmt that is being vectorized.
4677 SLP_NODE is an SLP node containing a group of reduction statements. The
4678 first one in this group is STMT_INFO.
4679 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4680 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4681 (counting from 0)
4683 This function:
4684 1. Completes the reduction def-use cycles.
4685 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4686 by calling the function specified by REDUC_FN if available, or by
4687 other means (whole-vector shifts or a scalar loop).
4688 The function also creates a new phi node at the loop exit to preserve
4689 loop-closed form, as illustrated below.
4691 The flow at the entry to this function:
4693 loop:
4694 vec_def = phi <vec_init, null> # REDUCTION_PHI
4695 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4696 s_loop = scalar_stmt # (scalar) STMT_INFO
4697 loop_exit:
4698 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4699 use <s_out0>
4700 use <s_out0>
4702 The above is transformed by this function into:
4704 loop:
4705 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4706 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4707 s_loop = scalar_stmt # (scalar) STMT_INFO
4708 loop_exit:
4709 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4710 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4711 v_out2 = reduce <v_out1>
4712 s_out3 = extract_field <v_out2, 0>
4713 s_out4 = adjust_result <s_out3>
4714 use <s_out4>
4715 use <s_out4>
4718 static void
4719 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4720 stmt_vec_info stmt_info,
4721 slp_tree slp_node,
4722 slp_instance slp_node_instance)
4724 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4725 gcc_assert (reduc_info->is_reduc_info);
4726 /* For double reductions we need to get at the inner loop reduction
4727 stmt which has the meta info attached. Our stmt_info is that of the
4728 loop-closed PHI of the inner loop which we remember as
4729 def for the reduction PHI generation. */
4730 bool double_reduc = false;
4731 stmt_vec_info rdef_info = stmt_info;
4732 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4734 gcc_assert (!slp_node);
4735 double_reduc = true;
4736 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4737 (stmt_info->stmt, 0));
4738 stmt_info = vect_stmt_to_vectorize (stmt_info);
4740 gphi *reduc_def_stmt
4741 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4742 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4743 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4744 tree vectype;
4745 machine_mode mode;
4746 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4747 basic_block exit_bb;
4748 tree scalar_dest;
4749 tree scalar_type;
4750 gimple *new_phi = NULL, *phi;
4751 gimple_stmt_iterator exit_gsi;
4752 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4753 gimple *epilog_stmt = NULL;
4754 gimple *exit_phi;
4755 tree bitsize;
4756 tree def;
4757 tree orig_name, scalar_result;
4758 imm_use_iterator imm_iter, phi_imm_iter;
4759 use_operand_p use_p, phi_use_p;
4760 gimple *use_stmt;
4761 bool nested_in_vect_loop = false;
4762 auto_vec<gimple *> new_phis;
4763 int j, i;
4764 auto_vec<tree> scalar_results;
4765 unsigned int group_size = 1, k;
4766 auto_vec<gimple *> phis;
4767 bool slp_reduc = false;
4768 bool direct_slp_reduc;
4769 tree new_phi_result;
4770 tree induction_index = NULL_TREE;
4772 if (slp_node)
4773 group_size = SLP_TREE_LANES (slp_node);
4775 if (nested_in_vect_loop_p (loop, stmt_info))
4777 outer_loop = loop;
4778 loop = loop->inner;
4779 nested_in_vect_loop = true;
4780 gcc_assert (!slp_node);
4782 gcc_assert (!nested_in_vect_loop || double_reduc);
4784 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4785 gcc_assert (vectype);
4786 mode = TYPE_MODE (vectype);
4788 tree initial_def = NULL;
4789 tree induc_val = NULL_TREE;
4790 tree adjustment_def = NULL;
4791 if (slp_node)
4793 else
4795 /* Get at the scalar def before the loop, that defines the initial value
4796 of the reduction variable. */
4797 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4798 loop_preheader_edge (loop));
4799 /* Optimize: for induction condition reduction, if we can't use zero
4800 for induc_val, use initial_def. */
4801 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4802 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4803 else if (double_reduc)
4805 else if (nested_in_vect_loop)
4807 else
4808 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4811 unsigned vec_num;
4812 int ncopies;
4813 if (slp_node)
4815 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4816 ncopies = 1;
4818 else
4820 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4821 vec_num = 1;
4822 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4825 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4826 which is updated with the current index of the loop for every match of
4827 the original loop's cond_expr (VEC_STMT). This results in a vector
4828 containing the last time the condition passed for that vector lane.
4829 The first match will be a 1 to allow 0 to be used for non-matching
4830 indexes. If there are no matches at all then the vector will be all
4831 zeroes.
4833 PR92772: This algorithm is broken for architectures that support
4834 masked vectors, but do not provide fold_extract_last. */
4835 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4837 auto_vec<std::pair<tree, bool>, 2> ccompares;
4838 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4839 cond_info = vect_stmt_to_vectorize (cond_info);
4840 while (cond_info != reduc_info)
4842 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4844 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4845 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4846 ccompares.safe_push
4847 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4848 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4850 cond_info
4851 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4852 1 + STMT_VINFO_REDUC_IDX
4853 (cond_info)));
4854 cond_info = vect_stmt_to_vectorize (cond_info);
4856 gcc_assert (ccompares.length () != 0);
4858 tree indx_before_incr, indx_after_incr;
4859 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4860 int scalar_precision
4861 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4862 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4863 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4864 (TYPE_MODE (vectype), cr_index_scalar_type,
4865 TYPE_VECTOR_SUBPARTS (vectype));
4867 /* First we create a simple vector induction variable which starts
4868 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4869 vector size (STEP). */
4871 /* Create a {1,2,3,...} vector. */
4872 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4874 /* Create a vector of the step value. */
4875 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4876 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4878 /* Create an induction variable. */
4879 gimple_stmt_iterator incr_gsi;
4880 bool insert_after;
4881 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4882 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4883 insert_after, &indx_before_incr, &indx_after_incr);
4885 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4886 filled with zeros (VEC_ZERO). */
4888 /* Create a vector of 0s. */
4889 tree zero = build_zero_cst (cr_index_scalar_type);
4890 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4892 /* Create a vector phi node. */
4893 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4894 new_phi = create_phi_node (new_phi_tree, loop->header);
4895 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4896 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4898 /* Now take the condition from the loop's original cond_exprs
4899 and produce a new cond_exprs (INDEX_COND_EXPR) which for
4900 every match uses values from the induction variable
4901 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4902 (NEW_PHI_TREE).
4903 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4904 the new cond_expr (INDEX_COND_EXPR). */
4905 gimple_seq stmts = NULL;
4906 for (int i = ccompares.length () - 1; i != -1; --i)
4908 tree ccompare = ccompares[i].first;
4909 if (ccompares[i].second)
4910 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4911 cr_index_vector_type,
4912 ccompare,
4913 indx_before_incr, new_phi_tree);
4914 else
4915 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4916 cr_index_vector_type,
4917 ccompare,
4918 new_phi_tree, indx_before_incr);
4920 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4922 /* Update the phi with the vec cond. */
4923 induction_index = new_phi_tree;
4924 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4925 loop_latch_edge (loop), UNKNOWN_LOCATION);
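/* Lane-by-lane illustration (hypothetical 4-lane vector): the IV
   starts at {1, 2, 3, 4} and steps by {4, 4, 4, 4}. If lane 2
   matches in the first iteration and lane 0 in the second, the phi
   evolves {0, 0, 0, 0} -> {0, 0, 3, 0} -> {5, 0, 3, 0}, so each lane
   ends up holding the index of its last match, or 0 if it never
   matched. */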
4928 /* 2. Create epilog code.
4929 The reduction epilog code operates across the elements of the vector
4930 of partial results computed by the vectorized loop.
4931 The reduction epilog code consists of:
4933 step 1: compute the scalar result in a vector (v_out2)
4934 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4935 step 3: adjust the scalar result (s_out3) if needed.
4937 Step 1 can be accomplished using one of the following three schemes:
4938 (scheme 1) using reduc_fn, if available.
4939 (scheme 2) using whole-vector shifts, if available.
4940 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4941 combined.
4943 The overall epilog code looks like this:
4945 s_out0 = phi <s_loop> # original EXIT_PHI
4946 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4947 v_out2 = reduce <v_out1> # step 1
4948 s_out3 = extract_field <v_out2, 0> # step 2
4949 s_out4 = adjust_result <s_out3> # step 3
4951 (step 3 is optional, and steps 1 and 2 may be combined).
4952 Lastly, the uses of s_out0 are replaced by s_out4. */
4955 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4956 v_out1 = phi <VECT_DEF>
4957 Store them in NEW_PHIS. */
4958 if (double_reduc)
4959 loop = outer_loop;
4960 exit_bb = single_exit (loop)->dest;
4961 new_phis.create (slp_node ? vec_num : ncopies);
4962 for (unsigned i = 0; i < vec_num; i++)
4964 if (slp_node)
4965 def = vect_get_slp_vect_def (slp_node, i);
4966 else
4967 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
4968 for (j = 0; j < ncopies; j++)
4970 tree new_def = copy_ssa_name (def);
4971 phi = create_phi_node (new_def, exit_bb);
4972 if (j == 0)
4973 new_phis.quick_push (phi);
4974 else
4976 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
4977 new_phis.quick_push (phi);
4980 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4984 exit_gsi = gsi_after_labels (exit_bb);
4986 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4987 (i.e. when reduc_fn is not available) and in the final adjustment
4988 code (if needed). Also get the original scalar reduction variable as
4989 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4990 represents a reduction pattern), the tree-code and scalar-def are
4991 taken from the original stmt that the pattern-stmt (STMT) replaces.
4992 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4993 are taken from STMT. */
4995 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4996 if (orig_stmt_info != stmt_info)
4998 /* Reduction pattern */
4999 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5000 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5003 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5004 scalar_type = TREE_TYPE (scalar_dest);
5005 scalar_results.create (group_size);
5006 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5007 bitsize = TYPE_SIZE (scalar_type);
5009 /* SLP reduction without reduction chain, e.g.,
5010 # a1 = phi <a2, a0>
5011 # b1 = phi <b2, b0>
5012 a2 = operation (a1)
5013 b2 = operation (b1) */
5014 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5016 /* True if we should implement SLP_REDUC using native reduction operations
5017 instead of scalar operations. */
5018 direct_slp_reduc = (reduc_fn != IFN_LAST
5019 && slp_reduc
5020 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5022 /* In case of reduction chain, e.g.,
5023 # a1 = phi <a3, a0>
5024 a2 = operation (a1)
5025 a3 = operation (a2),
5027 we may end up with more than one vector result. Here we reduce them to
5028 one vector. */
5029 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5031 gimple_seq stmts = NULL;
5032 tree first_vect = PHI_RESULT (new_phis[0]);
5033 first_vect = gimple_convert (&stmts, vectype, first_vect);
5034 for (k = 1; k < new_phis.length (); k++)
5036 gimple *next_phi = new_phis[k];
5037 tree second_vect = PHI_RESULT (next_phi);
5038 second_vect = gimple_convert (&stmts, vectype, second_vect);
5039 first_vect = gimple_build (&stmts, code, vectype,
5040 first_vect, second_vect);
5042 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5044 new_phi_result = first_vect;
5045 new_phis.truncate (0);
5046 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5048 /* Likewise if we couldn't use a single defuse cycle. */
5049 else if (ncopies > 1)
5051 gimple_seq stmts = NULL;
5052 tree first_vect = PHI_RESULT (new_phis[0]);
5053 first_vect = gimple_convert (&stmts, vectype, first_vect);
5054 for (int k = 1; k < ncopies; ++k)
5056 tree second_vect = PHI_RESULT (new_phis[k]);
5057 second_vect = gimple_convert (&stmts, vectype, second_vect);
5058 first_vect = gimple_build (&stmts, code, vectype,
5059 first_vect, second_vect);
5061 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5062 new_phi_result = first_vect;
5063 new_phis.truncate (0);
5064 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5066 else
5067 new_phi_result = PHI_RESULT (new_phis[0]);
5069 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5070 && reduc_fn != IFN_LAST)
5072 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5073 various data values where the condition matched and another vector
5074 (INDUCTION_INDEX) containing all the indexes of those matches. We
5075 need to extract the last matching index (which will be the index with
5076 highest value) and use this to index into the data vector.
5077 For the case where there were no matches, the data vector will contain
5078 all default values and the index vector will be all zeros. */
5080 /* Get various versions of the type of the vector of indexes. */
5081 tree index_vec_type = TREE_TYPE (induction_index);
5082 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5083 tree index_scalar_type = TREE_TYPE (index_vec_type);
5084 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5086 /* Get an unsigned integer version of the type of the data vector. */
5087 int scalar_precision
5088 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5089 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5090 tree vectype_unsigned = build_vector_type
5091 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5093 /* First we need to create a vector (ZERO_VEC) of zeros and another
5094 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5095 can create using a MAX reduction and then expanding.
5096 In the case where the loop never made any matches, the max index will
5097 be zero. */
5099 /* Vector of {0, 0, 0,...}. */
5100 tree zero_vec = build_zero_cst (vectype);
5102 gimple_seq stmts = NULL;
5103 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5104 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5106 /* Find maximum value from the vector of found indexes. */
5107 tree max_index = make_ssa_name (index_scalar_type);
5108 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5109 1, induction_index);
5110 gimple_call_set_lhs (max_index_stmt, max_index);
5111 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5113 /* Vector of {max_index, max_index, max_index,...}. */
5114 tree max_index_vec = make_ssa_name (index_vec_type);
5115 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5116 max_index);
5117 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5118 max_index_vec_rhs);
5119 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5121 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5122 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5123 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5124 otherwise. Only one value should match, resulting in a vector
5125 (VEC_COND) with one data value and the rest zeros.
5126 In the case where the loop never made any matches, every index will
5127 match, resulting in a vector with all data values (which will all be
5128 the default value). */
5130 /* Compare the max index vector to the vector of found indexes to find
5131 the position of the max value. */
5132 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5133 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5134 induction_index,
5135 max_index_vec);
5136 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5138 /* Use the compare to choose either values from the data vector or
5139 zero. */
5140 tree vec_cond = make_ssa_name (vectype);
5141 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5142 vec_compare, new_phi_result,
5143 zero_vec);
5144 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5146 /* Finally we need to extract the data value from the vector (VEC_COND)
5147 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5148 reduction, but because this doesn't exist, we can use a MAX reduction
5149 instead. The data value might be signed or a float so we need to cast
5150 it first.
5151 In the case where the loop never made any matches, the data values are
5152 all identical, and so will reduce down correctly. */
5154 /* Make the matched data values unsigned. */
5155 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5156 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5157 vec_cond);
5158 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5159 VIEW_CONVERT_EXPR,
5160 vec_cond_cast_rhs);
5161 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5163 /* Reduce down to a scalar value. */
5164 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5165 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5166 1, vec_cond_cast);
5167 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5168 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5170 /* Convert the reduced value back to the result type and set as the
5171 result. */
5172 stmts = NULL;
5173 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5174 data_reduc);
5175 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5176 scalar_results.safe_push (new_temp);
5178 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5179 && reduc_fn == IFN_LAST)
5181 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5182 idx = 0;
5183 idx_val = induction_index[0];
5184 val = data_reduc[0];
5185 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5186 if (induction_index[i] > idx_val)
5187 val = data_reduc[i], idx_val = induction_index[i];
5188 return val; */
5190 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5191 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5192 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5193 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5194 /* Enforced by vectorizable_reduction, which ensures we have target
5195 support before allowing a conditional reduction on variable-length
5196 vectors. */
5197 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5198 tree idx_val = NULL_TREE, val = NULL_TREE;
5199 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5201 tree old_idx_val = idx_val;
5202 tree old_val = val;
5203 idx_val = make_ssa_name (idx_eltype);
5204 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5205 build3 (BIT_FIELD_REF, idx_eltype,
5206 induction_index,
5207 bitsize_int (el_size),
5208 bitsize_int (off)));
5209 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5210 val = make_ssa_name (data_eltype);
5211 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5212 build3 (BIT_FIELD_REF,
5213 data_eltype,
5214 new_phi_result,
5215 bitsize_int (el_size),
5216 bitsize_int (off)));
5217 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5218 if (off != 0)
5220 tree new_idx_val = idx_val;
5221 if (off != v_size - el_size)
5223 new_idx_val = make_ssa_name (idx_eltype);
5224 epilog_stmt = gimple_build_assign (new_idx_val,
5225 MAX_EXPR, idx_val,
5226 old_idx_val);
5227 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5229 tree new_val = make_ssa_name (data_eltype);
5230 epilog_stmt = gimple_build_assign (new_val,
5231 COND_EXPR,
5232 build2 (GT_EXPR,
5233 boolean_type_node,
5234 idx_val,
5235 old_idx_val),
5236 val, old_val);
5237 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5238 idx_val = new_idx_val;
5239 val = new_val;
5242 /* Convert the reduced value back to the result type and set as the
5243 result. */
5244 gimple_seq stmts = NULL;
5245 val = gimple_convert (&stmts, scalar_type, val);
5246 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5247 scalar_results.safe_push (val);
5250 /* 2.3 Create the reduction code, using one of the three schemes described
5251 above. In SLP we simply need to extract all the elements from the
5252 vector (without reducing them), so we use scalar shifts. */
5253 else if (reduc_fn != IFN_LAST && !slp_reduc)
5255 tree tmp;
5256 tree vec_elem_type;
5258 /* Case 1: Create:
5259 v_out2 = reduc_expr <v_out1> */
5261 if (dump_enabled_p ())
5262 dump_printf_loc (MSG_NOTE, vect_location,
5263 "Reduce using direct vector reduction.\n");
5265 gimple_seq stmts = NULL;
5266 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5267 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5268 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5269 vec_elem_type, new_phi_result);
5270 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5271 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5273 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5274 && induc_val)
5276 /* Earlier we set the initial value to be a vector of induc_val
5277 values. Check the result and if it is induc_val then replace
5278 with the original initial value, unless induc_val is
5279 the same as initial_def already. */
5280 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5281 induc_val);
5283 tmp = make_ssa_name (new_scalar_dest);
5284 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5285 initial_def, new_temp);
5286 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5287 new_temp = tmp;
5290 scalar_results.safe_push (new_temp);
5292 else if (direct_slp_reduc)
5294 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5295 with the elements for other SLP statements replaced with the
5296 neutral value. We can then do a normal reduction on each vector. */
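      /* For instance, with REDUC_GROUP_SIZE == 2 and a single vector
         {a0, b0, a1, b1}, the code below builds {a0, N, a1, N} and
         {N, b0, N, b1}, N being the neutral value (or the initial scalar
         value when no neutral value exists), and reduces each of them with
         a full-vector reduction.  */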
5298 /* Enforced by vectorizable_reduction. */
5299 gcc_assert (new_phis.length () == 1);
5300 gcc_assert (pow2p_hwi (group_size));
5302 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5303 vec<stmt_vec_info> orig_phis
5304 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5305 gimple_seq seq = NULL;
5307 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5308 and the same element size as VECTYPE. */
5309 tree index = build_index_vector (vectype, 0, 1);
5310 tree index_type = TREE_TYPE (index);
5311 tree index_elt_type = TREE_TYPE (index_type);
5312 tree mask_type = truth_type_for (index_type);
5314 /* Create a vector that, for each element, identifies which of
5315 the REDUC_GROUP_SIZE results should use it. */
5316 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5317 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5318 build_vector_from_val (index_type, index_mask));
5320 /* Get a neutral vector value. This is simply a splat of the neutral
5321 scalar value if we have one, otherwise the initial scalar value
5322 is itself a neutral value. */
5323 tree vector_identity = NULL_TREE;
5324 tree neutral_op = NULL_TREE;
5325 if (slp_node)
5327 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5328 neutral_op
5329 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5330 vectype, code, first != NULL);
5332 if (neutral_op)
5333 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5334 neutral_op);
5335 for (unsigned int i = 0; i < group_size; ++i)
5337 /* If there's no universal neutral value, we can use the
5338 initial scalar value from the original PHI. This is used
5339 for MIN and MAX reduction, for example. */
5340 if (!neutral_op)
5342 tree scalar_value
5343 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5344 loop_preheader_edge (loop));
5345 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5346 scalar_value);
5347 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5348 scalar_value);
5351 /* Calculate the equivalent of:
5353 sel[j] = (index[j] == i);
5355 which selects the elements of NEW_PHI_RESULT that should
5356 be included in the result. */
5357 tree compare_val = build_int_cst (index_elt_type, i);
5358 compare_val = build_vector_from_val (index_type, compare_val);
5359 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5360 index, compare_val);
5362 /* Calculate the equivalent of:
5364 vec = sel ? new_phi_result : vector_identity;
5366 VEC is now suitable for a full vector reduction. */
5367 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5368 sel, new_phi_result, vector_identity);
5370 /* Do the reduction and convert it to the appropriate type. */
5371 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5372 TREE_TYPE (vectype), vec);
5373 scalar = gimple_convert (&seq, scalar_type, scalar);
5374 scalar_results.safe_push (scalar);
5376 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5378 else
5380 bool reduce_with_shift;
5381 tree vec_temp;
5383 gcc_assert (slp_reduc || new_phis.length () == 1);
5385 /* See if the target wants to do the final (shift) reduction
5386 in a vector mode of smaller size and first reduce upper/lower
5387 halves against each other. */
5388 enum machine_mode mode1 = mode;
5389 tree stype = TREE_TYPE (vectype);
5390 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5391 unsigned nunits1 = nunits;
5392 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5393 && new_phis.length () == 1)
5395 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5396 /* For SLP reductions we have to make sure lanes match up, but
5397 since we're doing an individual-element final reduction,
5398 reducing the vector width here is even more important.
5399 ??? We could also separate lanes with permutes; for the common
5400 case of a power-of-two group size, odd/even extracts would work. */
5401 if (slp_reduc && nunits != nunits1)
5403 nunits1 = least_common_multiple (nunits1, group_size);
5404 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5407 if (!slp_reduc
5408 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5409 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5411 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5412 stype, nunits1);
5413 reduce_with_shift = have_whole_vector_shift (mode1);
5414 if (!VECTOR_MODE_P (mode1))
5415 reduce_with_shift = false;
5416 else
5418 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5419 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5420 reduce_with_shift = false;
5423 /* First reduce the vector to the vector size we should do the shift
5424 reduction on, by combining upper and lower halves. */
5425 new_temp = new_phi_result;
5426 while (nunits > nunits1)
5428 nunits /= 2;
5429 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5430 stype, nunits);
5431 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5433 /* The target has to make sure we support lowpart/highpart
5434 extraction, either via direct vector extract or through
5435 integer mode punning. */
5436 tree dst1, dst2;
5437 if (convert_optab_handler (vec_extract_optab,
5438 TYPE_MODE (TREE_TYPE (new_temp)),
5439 TYPE_MODE (vectype1))
5440 != CODE_FOR_nothing)
5442 /* Extract sub-vectors directly once vec_extract becomes
5443 a conversion optab. */
5444 dst1 = make_ssa_name (vectype1);
5445 epilog_stmt
5446 = gimple_build_assign (dst1, BIT_FIELD_REF,
5447 build3 (BIT_FIELD_REF, vectype1,
5448 new_temp, TYPE_SIZE (vectype1),
5449 bitsize_int (0)));
5450 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5451 dst2 = make_ssa_name (vectype1);
5452 epilog_stmt
5453 = gimple_build_assign (dst2, BIT_FIELD_REF,
5454 build3 (BIT_FIELD_REF, vectype1,
5455 new_temp, TYPE_SIZE (vectype1),
5456 bitsize_int (bitsize)));
5457 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5459 else
5461 /* Extract via punning to an appropriately sized integer mode
5462 vector. */
5463 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5464 tree etype = build_vector_type (eltype, 2);
5465 gcc_assert (convert_optab_handler (vec_extract_optab,
5466 TYPE_MODE (etype),
5467 TYPE_MODE (eltype))
5468 != CODE_FOR_nothing);
5469 tree tem = make_ssa_name (etype);
5470 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5471 build1 (VIEW_CONVERT_EXPR,
5472 etype, new_temp));
5473 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5474 new_temp = tem;
5475 tem = make_ssa_name (eltype);
5476 epilog_stmt
5477 = gimple_build_assign (tem, BIT_FIELD_REF,
5478 build3 (BIT_FIELD_REF, eltype,
5479 new_temp, TYPE_SIZE (eltype),
5480 bitsize_int (0)));
5481 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5482 dst1 = make_ssa_name (vectype1);
5483 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5484 build1 (VIEW_CONVERT_EXPR,
5485 vectype1, tem));
5486 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5487 tem = make_ssa_name (eltype);
5488 epilog_stmt
5489 = gimple_build_assign (tem, BIT_FIELD_REF,
5490 build3 (BIT_FIELD_REF, eltype,
5491 new_temp, TYPE_SIZE (eltype),
5492 bitsize_int (bitsize)));
5493 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5494 dst2 = make_ssa_name (vectype1);
5495 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5496 build1 (VIEW_CONVERT_EXPR,
5497 vectype1, tem));
5498 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5501 new_temp = make_ssa_name (vectype1);
5502 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5503 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5504 new_phis[0] = epilog_stmt;
5507 if (reduce_with_shift && !slp_reduc)
5509 int element_bitsize = tree_to_uhwi (bitsize);
5510 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5511 for variable-length vectors and also requires direct target support
5512 for loop reductions. */
5513 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5514 int nelements = vec_size_in_bits / element_bitsize;
5515 vec_perm_builder sel;
5516 vec_perm_indices indices;
5518 int elt_offset;
5520 tree zero_vec = build_zero_cst (vectype1);
5521 /* Case 2: Create:
5522 for (offset = nelements/2; offset >= 1; offset/=2)
5524 Create: va' = vec_shift <va, offset>
5525 Create: va = vop <va, va'>
5526 } */
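      /* E.g. for a plus-reduction of a 4-element vector {a, b, c, d},
         assuming the permute shifts elements towards lane 0 and fills the
         vacated lanes from ZERO_VEC: shifting by 2 gives {c, d, 0, 0} and
         adding yields {a+c, b+d, ...}; shifting that by 1 and adding again
         leaves the full sum in lane 0, which is extracted below.  */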
5528 tree rhs;
5530 if (dump_enabled_p ())
5531 dump_printf_loc (MSG_NOTE, vect_location,
5532 "Reduce using vector shifts\n");
5534 gimple_seq stmts = NULL;
5535 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5536 for (elt_offset = nelements / 2;
5537 elt_offset >= 1;
5538 elt_offset /= 2)
5540 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5541 indices.new_vector (sel, 2, nelements);
5542 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5543 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5544 new_temp, zero_vec, mask);
5545 new_temp = gimple_build (&stmts, code,
5546 vectype1, new_name, new_temp);
5548 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5550 /* 2.4 Extract the final scalar result. Create:
5551 s_out3 = extract_field <v_out2, bitpos> */
5553 if (dump_enabled_p ())
5554 dump_printf_loc (MSG_NOTE, vect_location,
5555 "extract scalar result\n");
5557 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5558 bitsize, bitsize_zero_node);
5559 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5560 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5561 gimple_assign_set_lhs (epilog_stmt, new_temp);
5562 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5563 scalar_results.safe_push (new_temp);
5565 else
5567 /* Case 3: Create:
5568 s = extract_field <v_out2, 0>
5569 for (offset = element_size;
5570 offset < vector_size;
5571 offset += element_size;)
5573 Create: s' = extract_field <v_out2, offset>
5574 Create: s = op <s, s'> // For non SLP cases
5575 } */
5577 if (dump_enabled_p ())
5578 dump_printf_loc (MSG_NOTE, vect_location,
5579 "Reduce using scalar code.\n");
5581 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5582 int element_bitsize = tree_to_uhwi (bitsize);
5583 tree compute_type = TREE_TYPE (vectype);
5584 gimple_seq stmts = NULL;
5585 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5587 int bit_offset;
5588 if (gimple_code (new_phi) == GIMPLE_PHI)
5589 vec_temp = PHI_RESULT (new_phi);
5590 else
5591 vec_temp = gimple_assign_lhs (new_phi);
5592 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5593 vec_temp, bitsize, bitsize_zero_node);
5595 /* In SLP we don't need to apply the reduction operation, so we just
5596 collect s' values in SCALAR_RESULTS. */
5597 if (slp_reduc)
5598 scalar_results.safe_push (new_temp);
5600 for (bit_offset = element_bitsize;
5601 bit_offset < vec_size_in_bits;
5602 bit_offset += element_bitsize)
5604 tree bitpos = bitsize_int (bit_offset);
5605 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5606 compute_type, vec_temp,
5607 bitsize, bitpos);
5608 if (slp_reduc)
5610 /* In SLP we don't need to apply the reduction operation, so
5611 we just collect s' values in SCALAR_RESULTS. */
5612 new_temp = new_name;
5613 scalar_results.safe_push (new_name);
5615 else
5616 new_temp = gimple_build (&stmts, code, compute_type,
5617 new_name, new_temp);
5621 /* The only case where we need to reduce scalar results in SLP is
5622 unrolling. If the size of SCALAR_RESULTS is greater than
5623 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5624 REDUC_GROUP_SIZE. */
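      /* E.g. with REDUC_GROUP_SIZE == 2 and four collected scalars
         s0, s1, s2, s3 (two unrolled copies), the loop below computes
         scalar_results[0] = s0 op s2 and scalar_results[1] = s1 op s3.  */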
5625 if (slp_reduc)
5627 tree res, first_res, new_res;
5629 /* Reduce multiple scalar results in case of SLP unrolling. */
5630 for (j = group_size; scalar_results.iterate (j, &res);
5631 j++)
5633 first_res = scalar_results[j % group_size];
5634 new_res = gimple_build (&stmts, code, compute_type,
5635 first_res, res);
5636 scalar_results[j % group_size] = new_res;
5638 for (k = 0; k < group_size; k++)
5639 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5640 scalar_results[k]);
5642 else
5644 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5645 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5646 scalar_results.safe_push (new_temp);
5649 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5652 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5653 && induc_val)
5655 /* Earlier we set the initial value to be a vector of induc_val
5656 values. Check the result and if it is induc_val then replace
5657 with the original initial value, unless induc_val is
5658 the same as initial_def already. */
5659 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5660 induc_val);
5662 tree tmp = make_ssa_name (new_scalar_dest);
5663 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5664 initial_def, new_temp);
5665 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5666 scalar_results[0] = tmp;
5670 /* 2.5 Adjust the final result by the initial value of the reduction
5671 variable. (When such adjustment is not needed, then
5672 'adjustment_def' is zero). For example, if code is PLUS we create:
5673 new_temp = loop_exit_def + adjustment_def */
5675 if (adjustment_def)
5677 gcc_assert (!slp_reduc);
5678 gimple_seq stmts = NULL;
5679 if (nested_in_vect_loop)
5681 new_phi = new_phis[0];
5682 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5683 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5684 new_temp = gimple_build (&stmts, code, vectype,
5685 PHI_RESULT (new_phi), adjustment_def);
5687 else
5689 new_temp = scalar_results[0];
5690 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5691 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5692 new_temp = gimple_build (&stmts, code, scalar_type,
5693 new_temp, adjustment_def);
5696 epilog_stmt = gimple_seq_last_stmt (stmts);
5697 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5698 if (nested_in_vect_loop)
5700 if (!double_reduc)
5701 scalar_results.quick_push (new_temp);
5702 else
5703 scalar_results[0] = new_temp;
5705 else
5706 scalar_results[0] = new_temp;
5708 new_phis[0] = epilog_stmt;
5711 if (double_reduc)
5712 loop = loop->inner;
5714 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5715 phis with new adjusted scalar results, i.e., replace use <s_out0>
5716 with use <s_out4>.
5718 Transform:
5719 loop_exit:
5720 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5721 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5722 v_out2 = reduce <v_out1>
5723 s_out3 = extract_field <v_out2, 0>
5724 s_out4 = adjust_result <s_out3>
5725 use <s_out0>
5726 use <s_out0>
5728 into:
5730 loop_exit:
5731 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5732 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5733 v_out2 = reduce <v_out1>
5734 s_out3 = extract_field <v_out2, 0>
5735 s_out4 = adjust_result <s_out3>
5736 use <s_out4>
5737 use <s_out4> */
5740 /* In an SLP reduction chain we reduce the vector results into one vector
5741 if necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is
5742 the LHS of the last stmt in the reduction chain, since we are looking
5743 for the loop exit phi node. */
5744 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5746 stmt_vec_info dest_stmt_info
5747 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5748 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5749 group_size = 1;
5752 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS
5753 (in case REDUC_GROUP_SIZE is greater than the vectorization factor).
5754 Therefore, we need to match SCALAR_RESULTS with the corresponding
5755 statements. The first (REDUC_GROUP_SIZE / number of new vector stmts)
5756 scalar results correspond to the first vector stmt, etc.
5757 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
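   /* E.g. with REDUC_GROUP_SIZE == 4 and two new vector stmts, RATIO is 2:
      scalar results 0 and 1 belong to the first vector stmt and scalar
      results 2 and 3 to the second.  */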
5758 if (group_size > new_phis.length ())
5759 gcc_assert (!(group_size % new_phis.length ()));
5761 for (k = 0; k < group_size; k++)
5763 if (slp_reduc)
5765 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5767 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5768 /* SLP statements can't participate in patterns. */
5769 gcc_assert (!orig_stmt_info);
5770 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5773 if (nested_in_vect_loop)
5775 if (double_reduc)
5776 loop = outer_loop;
5777 else
5778 gcc_unreachable ();
5781 phis.create (3);
5782 /* Find the loop-closed-use at the loop exit of the original scalar
5783 result. (The reduction result is expected to have two immediate uses,
5784 one at the latch block, and one at the loop exit). For double
5785 reductions we are looking for exit phis of the outer loop. */
5786 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5788 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5790 if (!is_gimple_debug (USE_STMT (use_p)))
5791 phis.safe_push (USE_STMT (use_p));
5793 else
5795 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5797 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5799 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5801 if (!flow_bb_inside_loop_p (loop,
5802 gimple_bb (USE_STMT (phi_use_p)))
5803 && !is_gimple_debug (USE_STMT (phi_use_p)))
5804 phis.safe_push (USE_STMT (phi_use_p));
5810 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5812 /* Replace the uses: */
5813 orig_name = PHI_RESULT (exit_phi);
5814 scalar_result = scalar_results[k];
5815 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5817 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5818 SET_USE (use_p, scalar_result);
5819 update_stmt (use_stmt);
5823 phis.release ();
5827 /* Return a vector of type VECTYPE that is equal to the vector select
5828 operation "MASK ? VEC : IDENTITY". Insert the select statements
5829 before GSI. */
5831 static tree
5832 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5833 tree vec, tree identity)
5835 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5836 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5837 mask, vec, identity);
5838 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5839 return cond;
5842 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5843 order, starting with LHS. Insert the extraction statements before GSI and
5844 associate the new scalar SSA names with variable SCALAR_DEST.
5845 Return the SSA name for the result. */
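/* For example, for a 4-element VECTOR_RHS v and PLUS_EXPR this open-codes
   (((LHS + v[0]) + v[1]) + v[2]) + v[3], preserving the strict
   left-to-right association that an in-order reduction requires.  */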
5847 static tree
5848 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5849 tree_code code, tree lhs, tree vector_rhs)
5851 tree vectype = TREE_TYPE (vector_rhs);
5852 tree scalar_type = TREE_TYPE (vectype);
5853 tree bitsize = TYPE_SIZE (scalar_type);
5854 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5855 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5857 for (unsigned HOST_WIDE_INT bit_offset = 0;
5858 bit_offset < vec_size_in_bits;
5859 bit_offset += element_bitsize)
5861 tree bitpos = bitsize_int (bit_offset);
5862 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5863 bitsize, bitpos);
5865 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5866 rhs = make_ssa_name (scalar_dest, stmt);
5867 gimple_assign_set_lhs (stmt, rhs);
5868 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5870 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5871 tree new_name = make_ssa_name (scalar_dest, stmt);
5872 gimple_assign_set_lhs (stmt, new_name);
5873 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5874 lhs = new_name;
5876 return lhs;
5879 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5880 type of the vector input. */
5882 static internal_fn
5883 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5885 internal_fn mask_reduc_fn;
5887 switch (reduc_fn)
5889 case IFN_FOLD_LEFT_PLUS:
5890 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5891 break;
5893 default:
5894 return IFN_LAST;
5897 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5898 OPTIMIZE_FOR_SPEED))
5899 return mask_reduc_fn;
5900 return IFN_LAST;
5903 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5904 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5905 statement. CODE is the operation performed by STMT_INFO and OPS are
5906 its scalar operands. REDUC_INDEX is the index of the operand in
5907 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5908 implements in-order reduction, or IFN_LAST if we should open-code it.
5909 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5910 that should be used to control the operation in a fully-masked loop. */
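/* For example, for an in-order "sum += a[i]" each vector of A is folded
   into the scalar SUM in element order, either via a single call to the
   target's IFN_FOLD_LEFT_PLUS (or its masked variant) or, when no such
   function is available, open-coded element by element through
   vect_expand_fold_left, so the result matches the scalar loop's
   association exactly.  */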
5912 static bool
5913 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5914 stmt_vec_info stmt_info,
5915 gimple_stmt_iterator *gsi,
5916 gimple **vec_stmt, slp_tree slp_node,
5917 gimple *reduc_def_stmt,
5918 tree_code code, internal_fn reduc_fn,
5919 tree ops[3], tree vectype_in,
5920 int reduc_index, vec_loop_masks *masks)
5922 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5923 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5924 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5926 int ncopies;
5927 if (slp_node)
5928 ncopies = 1;
5929 else
5930 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5932 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5933 gcc_assert (ncopies == 1);
5934 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5936 if (slp_node)
5937 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5938 TYPE_VECTOR_SUBPARTS (vectype_in)));
5940 tree op0 = ops[1 - reduc_index];
5942 int group_size = 1;
5943 stmt_vec_info scalar_dest_def_info;
5944 auto_vec<tree> vec_oprnds0;
5945 if (slp_node)
5947 auto_vec<vec<tree> > vec_defs (2);
5948 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
5949 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5950 vec_defs[0].release ();
5951 vec_defs[1].release ();
5952 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5953 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5955 else
5957 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
5958 op0, &vec_oprnds0);
5959 scalar_dest_def_info = stmt_info;
5962 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5963 tree scalar_type = TREE_TYPE (scalar_dest);
5964 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5966 int vec_num = vec_oprnds0.length ();
5967 gcc_assert (vec_num == 1 || slp_node);
5968 tree vec_elem_type = TREE_TYPE (vectype_out);
5969 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5971 tree vector_identity = NULL_TREE;
5972 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5973 vector_identity = build_zero_cst (vectype_out);
5975 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5976 int i;
5977 tree def0;
5978 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5980 gimple *new_stmt;
5981 tree mask = NULL_TREE;
5982 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5983 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5985 /* Handle MINUS by adding the negative. */
5986 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5988 tree negated = make_ssa_name (vectype_out);
5989 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5990 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5991 def0 = negated;
5994 if (mask && mask_reduc_fn == IFN_LAST)
5995 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5996 vector_identity);
5998 /* On the first iteration the input is simply the scalar phi
5999 result, and for subsequent iterations it is the output of
6000 the preceding operation. */
6001 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6003 if (mask && mask_reduc_fn != IFN_LAST)
6004 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6005 def0, mask);
6006 else
6007 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6008 def0);
6009 /* For chained SLP reductions the output of the previous reduction
6010 operation serves as the input of the next. For the final statement
6011 the output cannot be a temporary - we reuse the original
6012 scalar destination of the last statement. */
6013 if (i != vec_num - 1)
6015 gimple_set_lhs (new_stmt, scalar_dest_var);
6016 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6017 gimple_set_lhs (new_stmt, reduc_var);
6020 else
6022 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6023 reduc_var, def0);
6024 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6025 /* Remove the statement, so that we can use the same code paths
6026 as for statements that we've just created. */
6027 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6028 gsi_remove (&tmp_gsi, true);
6031 if (i == vec_num - 1)
6033 gimple_set_lhs (new_stmt, scalar_dest);
6034 vect_finish_replace_stmt (loop_vinfo,
6035 scalar_dest_def_info,
6036 new_stmt);
6038 else
6039 vect_finish_stmt_generation (loop_vinfo,
6040 scalar_dest_def_info,
6041 new_stmt, gsi);
6043 if (slp_node)
6044 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6045 else
6047 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6048 *vec_stmt = new_stmt;
6052 return true;
6055 /* Function is_nonwrapping_integer_induction.
6057 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6058 does not cause overflow. */
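/* Roughly: with constant BASE and STEP, the largest value the induction
   reaches is BASE + STEP * niters; the code below evaluates that in
   widest_int arithmetic and checks that it still fits in the precision
   of the PHI result type (or that overflow is undefined anyway).  */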
6060 static bool
6061 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6063 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6064 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6065 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6066 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6067 widest_int ni, max_loop_value, lhs_max;
6068 wi::overflow_type overflow = wi::OVF_NONE;
6070 /* Make sure the loop is integer based. */
6071 if (TREE_CODE (base) != INTEGER_CST
6072 || TREE_CODE (step) != INTEGER_CST)
6073 return false;
6075 /* Check that the max size of the loop will not wrap. */
6077 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6078 return true;
6080 if (! max_stmt_executions (loop, &ni))
6081 return false;
6083 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6084 &overflow);
6085 if (overflow)
6086 return false;
6088 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6089 TYPE_SIGN (lhs_type), &overflow);
6090 if (overflow)
6091 return false;
6093 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6094 <= TYPE_PRECISION (lhs_type));
6097 /* Check if masking can be supported by inserting a conditional expression.
6098 CODE is the code for the operation. COND_FN is the conditional internal
6099 function, if it exists. VECTYPE_IN is the type of the vector input. */
6100 static bool
6101 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6102 tree vectype_in)
6104 if (cond_fn != IFN_LAST
6105 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6106 OPTIMIZE_FOR_SPEED))
6107 return false;
6109 switch (code)
6111 case DOT_PROD_EXPR:
6112 case SAD_EXPR:
6113 return true;
6115 default:
6116 return false;
6120 /* Insert a conditional expression to enable masked vectorization. CODE is the
6121 code for the operation. VOP is the array of operands. MASK is the loop
6122 mask. GSI is a statement iterator used to place the new conditional
6123 expression. */
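/* For example, for DOT_PROD_EXPR the second operand becomes
   "mask ? vop[1] : 0", so inactive lanes add nothing to the accumulator;
   for SAD_EXPR it becomes "mask ? vop[1] : vop[0]", making the absolute
   difference zero in inactive lanes.  */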
6124 static void
6125 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6126 gimple_stmt_iterator *gsi)
6128 switch (code)
6130 case DOT_PROD_EXPR:
6132 tree vectype = TREE_TYPE (vop[1]);
6133 tree zero = build_zero_cst (vectype);
6134 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6135 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6136 mask, vop[1], zero);
6137 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6138 vop[1] = masked_op1;
6139 break;
6142 case SAD_EXPR:
6144 tree vectype = TREE_TYPE (vop[1]);
6145 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6146 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6147 mask, vop[1], vop[0]);
6148 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6149 vop[1] = masked_op1;
6150 break;
6153 default:
6154 gcc_unreachable ();
6158 /* Function vectorizable_reduction.
6160 Check if STMT_INFO performs a reduction operation that can be vectorized.
6161 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6162 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6163 Return true if STMT_INFO is vectorizable in this way.
6165 This function also handles reduction idioms (patterns) that have been
6166 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6167 may be of this form:
6168 X = pattern_expr (arg0, arg1, ..., X)
6169 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6170 sequence that had been detected and replaced by the pattern-stmt
6171 (STMT_INFO).
6173 This function also handles reduction of condition expressions, for example:
6174 for (int i = 0; i < N; i++)
6175 if (a[i] < value)
6176 last = a[i];
6177 This is handled by vectorizing the loop and creating an additional vector
6178 containing the loop indexes for which "a[i] < value" was true. In the
6179 function epilogue this is reduced to a single max value and then used to
6180 index into the vector of results.
6182 In some cases of reduction patterns, the type of the reduction variable X is
6183 different than the type of the other arguments of STMT_INFO.
6184 In such cases, the vectype that is used when transforming STMT_INFO into
6185 a vector stmt is different than the vectype that is used to determine the
6186 vectorization factor, because it consists of a different number of elements
6187 than the actual number of elements that are being operated upon in parallel.
6189 For example, consider an accumulation of shorts into an int accumulator.
6190 On some targets it's possible to vectorize this pattern operating on 8
6191 shorts at a time (hence, the vectype for purposes of determining the
6192 vectorization factor should be V8HI); on the other hand, the vectype that
6193 is used to create the vector form is actually V4SI (the type of the result).
6195 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6196 indicates what is the actual level of parallelism (V8HI in the example), so
6197 that the right vectorization factor would be derived. This vectype
6198 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6199 be used to create the vectorized stmt. The right vectype for the vectorized
6200 stmt is obtained from the type of the result X:
6201 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6203 This means that, contrary to "regular" reductions (or "regular" stmts in
6204 general), the following equation:
6205 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6206 does *NOT* necessarily hold for reduction patterns. */
6208 bool
6209 vectorizable_reduction (loop_vec_info loop_vinfo,
6210 stmt_vec_info stmt_info, slp_tree slp_node,
6211 slp_instance slp_node_instance,
6212 stmt_vector_for_cost *cost_vec)
6214 tree scalar_dest;
6215 tree vectype_in = NULL_TREE;
6216 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6217 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6218 stmt_vec_info cond_stmt_vinfo = NULL;
6219 tree scalar_type;
6220 int i;
6221 int ncopies;
6222 bool single_defuse_cycle = false;
6223 bool nested_cycle = false;
6224 bool double_reduc = false;
6225 int vec_num;
6226 tree tem;
6227 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6228 tree cond_reduc_val = NULL_TREE;
6230 /* Make sure it was already recognized as a reduction computation. */
6231 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6232 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6233 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6234 return false;
6236 /* The stmt we store reduction analysis meta on. */
6237 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6238 reduc_info->is_reduc_info = true;
6240 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6242 if (is_a <gphi *> (stmt_info->stmt))
6243 /* Analysis for double-reduction is done on the outer
6244 loop PHI, nested cycles have no further restrictions. */
6245 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6246 else
6247 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6248 return true;
6251 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6252 stmt_vec_info phi_info = stmt_info;
6253 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6254 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6256 if (!is_a <gphi *> (stmt_info->stmt))
6258 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6259 return true;
6261 if (slp_node)
6263 slp_node_instance->reduc_phis = slp_node;
6264 /* ??? We're leaving slp_node to point to the PHIs, we only
6265 need it to get at the number of vector stmts which wasn't
6266 yet initialized for the instance root. */
6268 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6269 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6270 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6272 use_operand_p use_p;
6273 gimple *use_stmt;
6274 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6275 &use_p, &use_stmt);
6276 gcc_assert (res);
6277 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6278 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6282 /* PHIs should not participate in patterns. */
6283 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6284 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6286 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6287 and compute the reduction chain length. */
6288 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6289 loop_latch_edge (loop));
6290 unsigned reduc_chain_length = 0;
6291 bool only_slp_reduc_chain = true;
6292 stmt_info = NULL;
6293 while (reduc_def != PHI_RESULT (reduc_def_phi))
6295 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6296 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6297 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6299 if (dump_enabled_p ())
6300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6301 "reduction chain broken by patterns.\n");
6302 return false;
6304 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6305 only_slp_reduc_chain = false;
6306 /* ??? For epilogue generation live members of the chain need
6307 to point back to the PHI via their original stmt for
6308 info_for_reduction to work. */
6309 if (STMT_VINFO_LIVE_P (vdef))
6310 STMT_VINFO_REDUC_DEF (def) = phi_info;
6311 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6312 if (!assign)
6314 if (dump_enabled_p ())
6315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6316 "reduction chain includes calls.\n");
6317 return false;
6319 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6321 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6322 TREE_TYPE (gimple_assign_rhs1 (assign))))
6324 if (dump_enabled_p ())
6325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6326 "conversion in the reduction chain.\n");
6327 return false;
6330 else if (!stmt_info)
6331 /* First non-conversion stmt. */
6332 stmt_info = vdef;
6333 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6334 reduc_chain_length++;
6336 /* PHIs should not participate in patterns. */
6337 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6339 if (nested_in_vect_loop_p (loop, stmt_info))
6341 loop = loop->inner;
6342 nested_cycle = true;
6345 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6346 element. */
6347 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6349 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6350 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6352 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6353 gcc_assert (slp_node
6354 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6356 /* 1. Is vectorizable reduction? */
6357 /* Not supportable if the reduction variable is used in the loop, unless
6358 it's a reduction chain. */
6359 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6360 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6361 return false;
6363 /* Reductions that are not used even in an enclosing outer-loop,
6364 are expected to be "live" (used out of the loop). */
6365 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6366 && !STMT_VINFO_LIVE_P (stmt_info))
6367 return false;
6369 /* 2. Has this been recognized as a reduction pattern?
6371 Check if STMT represents a pattern that has been recognized
6372 in earlier analysis stages. For stmts that represent a pattern,
6373 the STMT_VINFO_RELATED_STMT field records the last stmt in
6374 the original sequence that constitutes the pattern. */
6376 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6377 if (orig_stmt_info)
6379 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6380 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6383 /* 3. Check the operands of the operation. The first operands are defined
6384 inside the loop body. The last operand is the reduction variable,
6385 which is defined by the loop-header-phi. */
6387 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6388 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6389 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6390 enum tree_code code = gimple_assign_rhs_code (stmt);
6391 bool lane_reduc_code_p
6392 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6393 int op_type = TREE_CODE_LENGTH (code);
6395 scalar_dest = gimple_assign_lhs (stmt);
6396 scalar_type = TREE_TYPE (scalar_dest);
6397 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6398 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6399 return false;
6401 /* Do not try to vectorize bit-precision reductions. */
6402 if (!type_has_mode_precision_p (scalar_type))
6403 return false;
6405 /* For lane-reducing ops we're reducing the number of reduction PHIs
6406 which means the only use of the reduction PHI may be in the lane-reducing operation. */
6407 if (lane_reduc_code_p
6408 && reduc_chain_length != 1
6409 && !only_slp_reduc_chain)
6411 if (dump_enabled_p ())
6412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6413 "lane-reducing reduction with extra stmts.\n");
6414 return false;
6417 /* All uses but the last are expected to be defined in the loop.
6418 The last use is the reduction variable. In case of nested cycle this
6419 assumption is not true: we use reduc_index to record the index of the
6420 reduction variable. */
6421 /* ??? To get at invariant/constant uses on the SLP node we have to
6422 get to it here, slp_node is still the reduction PHI. */
6423 slp_tree slp_for_stmt_info = NULL;
6424 if (slp_node)
6426 slp_for_stmt_info = slp_node_instance->root;
6427 /* And then there's the reduction chain with a conversion ... */
6428 if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
6429 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6430 gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
6432 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6433 /* We need to skip an extra operand for COND_EXPRs with embedded
6434 comparison. */
6435 unsigned opno_adjust = 0;
6436 if (code == COND_EXPR
6437 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6438 opno_adjust = 1;
6439 for (i = 0; i < op_type; i++)
6441 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6442 if (i == 0 && code == COND_EXPR)
6443 continue;
6445 stmt_vec_info def_stmt_info;
6446 enum vect_def_type dt;
6447 tree op;
6448 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6449 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6450 &def_stmt_info))
6452 if (dump_enabled_p ())
6453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6454 "use not simple.\n");
6455 return false;
6457 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6458 continue;
6460 /* There should be only one cycle def in the stmt, the one
6461 leading to reduc_def. */
6462 if (VECTORIZABLE_CYCLE_DEF (dt))
6463 return false;
6465 /* To properly compute ncopies we are interested in the widest
6466 non-reduction input type in case we're looking at a widening
6467 accumulation that we later handle in vect_transform_reduction. */
6468 if (lane_reduc_code_p
6469 && tem
6470 && (!vectype_in
6471 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6472 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6473 vectype_in = tem;
6475 if (code == COND_EXPR)
6477 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6478 if (dt == vect_constant_def)
6480 cond_reduc_dt = dt;
6481 cond_reduc_val = op;
6483 if (dt == vect_induction_def
6484 && def_stmt_info
6485 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6487 cond_reduc_dt = dt;
6488 cond_stmt_vinfo = def_stmt_info;
6492 if (!vectype_in)
6493 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6494 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6496 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6497 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6498 /* If we have a condition reduction, see if we can simplify it further. */
6499 if (v_reduc_type == COND_REDUCTION)
6501 if (slp_node)
6502 return false;
6504 /* When the condition itself uses the reduction value, fail. */
6505 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6507 if (dump_enabled_p ())
6508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6509 "condition depends on previous iteration\n");
6510 return false;
6513 if (reduc_chain_length == 1
6514 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6515 vectype_in, OPTIMIZE_FOR_SPEED))
6517 if (dump_enabled_p ())
6518 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6519 "optimizing condition reduction with"
6520 " FOLD_EXTRACT_LAST.\n");
6521 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6523 else if (cond_reduc_dt == vect_induction_def)
6525 tree base
6526 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6527 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6529 gcc_assert (TREE_CODE (base) == INTEGER_CST
6530 && TREE_CODE (step) == INTEGER_CST);
6531 cond_reduc_val = NULL_TREE;
6532 enum tree_code cond_reduc_op_code = ERROR_MARK;
6533 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6534 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6536 /* Find a suitable value: below base for MAX_EXPR, above base for
6537 MIN_EXPR; for now punt if base is the minimum value of the type for
6538 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6539 else if (tree_int_cst_sgn (step) == -1)
6541 cond_reduc_op_code = MIN_EXPR;
6542 if (tree_int_cst_sgn (base) == -1)
6543 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6544 else if (tree_int_cst_lt (base,
6545 TYPE_MAX_VALUE (TREE_TYPE (base))))
6546 cond_reduc_val
6547 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6549 else
6551 cond_reduc_op_code = MAX_EXPR;
6552 if (tree_int_cst_sgn (base) == 1)
6553 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6554 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6555 base))
6556 cond_reduc_val
6557 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6559 if (cond_reduc_val)
6561 if (dump_enabled_p ())
6562 dump_printf_loc (MSG_NOTE, vect_location,
6563 "condition expression based on "
6564 "integer induction.\n");
6565 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6566 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6567 = cond_reduc_val;
6568 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6571 else if (cond_reduc_dt == vect_constant_def)
6573 enum vect_def_type cond_initial_dt;
6574 tree cond_initial_val
6575 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6577 gcc_assert (cond_reduc_val != NULL_TREE);
6578 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6579 if (cond_initial_dt == vect_constant_def
6580 && types_compatible_p (TREE_TYPE (cond_initial_val),
6581 TREE_TYPE (cond_reduc_val)))
6583 tree e = fold_binary (LE_EXPR, boolean_type_node,
6584 cond_initial_val, cond_reduc_val);
6585 if (e && (integer_onep (e) || integer_zerop (e)))
6587 if (dump_enabled_p ())
6588 dump_printf_loc (MSG_NOTE, vect_location,
6589 "condition expression based on "
6590 "compile time constant.\n");
6591 /* Record reduction code at analysis stage. */
6592 STMT_VINFO_REDUC_CODE (reduc_info)
6593 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6594 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6600 if (STMT_VINFO_LIVE_P (phi_info))
6601 return false;
6603 if (slp_node)
6604 ncopies = 1;
6605 else
6606 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6608 gcc_assert (ncopies >= 1);
6610 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6612 if (nested_cycle)
6614 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6615 == vect_double_reduction_def);
6616 double_reduc = true;
6619 /* 4.2. Check support for the epilog operation.
6621 If STMT represents a reduction pattern, then the type of the
6622 reduction variable may be different than the type of the rest
6623 of the arguments. For example, consider the case of accumulation
6624 of shorts into an int accumulator; The original code:
6625 S1: int_a = (int) short_a;
6626 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6628 was replaced with:
6629 STMT: int_acc = widen_sum <short_a, int_acc>
6631 This means that:
6632 1. The tree-code that is used to create the vector operation in the
6633 epilog code (that reduces the partial results) is not the
6634 tree-code of STMT, but is rather the tree-code of the original
6635 stmt from the pattern that STMT is replacing. I.e, in the example
6636 above we want to use 'widen_sum' in the loop, but 'plus' in the
6637 epilog.
6638 2. The type (mode) we use to check available target support
6639 for the vector operation to be created in the *epilog*, is
6640 determined by the type of the reduction variable (in the example
6641 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6642 However the type (mode) we use to check available target support
6643 for the vector operation to be created *inside the loop*, is
6644 determined by the type of the other arguments to STMT (in the
6645 example we'd check this: optab_handler (widen_sum_optab,
6646 vect_short_mode)).
6648 This is contrary to "regular" reductions, in which the types of all
6649 the arguments are the same as the type of the reduction variable.
6650 For "regular" reductions we can therefore use the same vector type
6651 (and also the same tree-code) when generating the epilog code and
6652 when generating the code inside the loop. */
6654 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6655 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6657 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6658 if (reduction_type == TREE_CODE_REDUCTION)
6660 /* Check whether it's ok to change the order of the computation.
6661 Generally, when vectorizing a reduction we change the order of the
6662 computation. This may change the behavior of the program in some
6663 cases, so we need to check that this is ok. One exception is when
6664 vectorizing an outer-loop: the inner-loop is executed sequentially,
6665 and therefore vectorizing reductions in the inner-loop during
6666 outer-loop vectorization is safe. */
6667 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6669 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6670 is not directly used in stmt. */
6671 if (!only_slp_reduc_chain
6672 && reduc_chain_length != 1)
6674 if (dump_enabled_p ())
6675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6676 "in-order reduction chain without SLP.\n");
6677 return false;
6679 STMT_VINFO_REDUC_TYPE (reduc_info)
6680 = reduction_type = FOLD_LEFT_REDUCTION;
6682 else if (!commutative_tree_code (orig_code)
6683 || !associative_tree_code (orig_code))
6685 if (dump_enabled_p ())
6686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6687 "reduction: not commutative/associative");
6688 return false;
6692 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6693 && ncopies > 1)
6695 if (dump_enabled_p ())
6696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6697 "multiple types in double reduction or condition "
6698 "reduction or fold-left reduction.\n");
6699 return false;
6702 internal_fn reduc_fn = IFN_LAST;
6703 if (reduction_type == TREE_CODE_REDUCTION
6704 || reduction_type == FOLD_LEFT_REDUCTION
6705 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6706 || reduction_type == CONST_COND_REDUCTION)
6708 if (reduction_type == FOLD_LEFT_REDUCTION
6709 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6710 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6712 if (reduc_fn != IFN_LAST
6713 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6714 OPTIMIZE_FOR_SPEED))
6716 if (dump_enabled_p ())
6717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6718 "reduc op not supported by target.\n");
6720 reduc_fn = IFN_LAST;
6723 else
6725 if (!nested_cycle || double_reduc)
6727 if (dump_enabled_p ())
6728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6729 "no reduc code for scalar code.\n");
6731 return false;
6735 else if (reduction_type == COND_REDUCTION)
6737 int scalar_precision
6738 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6739 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6740 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6741 nunits_out);
6743 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6744 OPTIMIZE_FOR_SPEED))
6745 reduc_fn = IFN_REDUC_MAX;
6747 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6749 if (reduction_type != EXTRACT_LAST_REDUCTION
6750 && (!nested_cycle || double_reduc)
6751 && reduc_fn == IFN_LAST
6752 && !nunits_out.is_constant ())
6754 if (dump_enabled_p ())
6755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6756 "missing target support for reduction on"
6757 " variable-length vectors.\n");
6758 return false;
6761 /* For SLP reductions, see if there is a neutral value we can use. */
6762 tree neutral_op = NULL_TREE;
6763 if (slp_node)
6764 neutral_op = neutral_op_for_slp_reduction
6765 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6766 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6768 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6770 /* We can't support in-order reductions of code such as this:
6772 for (int i = 0; i < n1; ++i)
6773 for (int j = 0; j < n2; ++j)
6774 l += a[j];
6776 since GCC effectively transforms the loop when vectorizing:
6778 for (int i = 0; i < n1 / VF; ++i)
6779 for (int j = 0; j < n2; ++j)
6780 for (int k = 0; k < VF; ++k)
6781 l += a[j];
6783 which is a reassociation of the original operation. */
6784 if (dump_enabled_p ())
6785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6786 "in-order double reduction not supported.\n");
6788 return false;
6791 if (reduction_type == FOLD_LEFT_REDUCTION
6792 && slp_node
6793 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6795 /* We cannot use in-order reductions in this case because there is
6796 an implicit reassociation of the operations involved. */
6797 if (dump_enabled_p ())
6798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6799 "in-order unchained SLP reductions not supported.\n");
6800 return false;
6803 /* For double reductions, and for SLP reductions with a neutral value,
6804 we construct a variable-length initial vector by loading a vector
6805 full of the neutral value and then shift-and-inserting the start
6806 values into the low-numbered elements. */
6807 if ((double_reduc || neutral_op)
6808 && !nunits_out.is_constant ()
6809 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6810 vectype_out, OPTIMIZE_FOR_SPEED))
6812 if (dump_enabled_p ())
6813 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6814 "reduction on variable-length vectors requires"
6815 " target support for a vector-shift-and-insert"
6816 " operation.\n");
6817 return false;
6820 /* Check extra constraints for variable-length unchained SLP reductions. */
6821 if (STMT_SLP_TYPE (stmt_info)
6822 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6823 && !nunits_out.is_constant ())
6825 /* We checked above that we could build the initial vector when
6826 there's a neutral element value. Check here for the case in
6827 which each SLP statement has its own initial value and in which
6828 that value needs to be repeated for every instance of the
6829 statement within the initial vector. */
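/* For instance (illustrative values), a two-lane SLP MAX reduction with
   start values a and b wants the initial vector { a, b, a, b, ... },
   which is precisely what duplicate-and-interleave provides when the
   target supports it.  */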
6830 unsigned int group_size = SLP_TREE_LANES (slp_node);
6831 if (!neutral_op
6832 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6833 TREE_TYPE (vectype_out)))
6835 if (dump_enabled_p ())
6836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6837 "unsupported form of SLP reduction for"
6838 " variable-length vectors: cannot build"
6839 " initial vector.\n");
6840 return false;
6842 /* The epilogue code relies on the number of elements being a multiple
6843 of the group size. The duplicate-and-interleave approach to setting
6844 up the initial vector does too. */
6845 if (!multiple_p (nunits_out, group_size))
6847 if (dump_enabled_p ())
6848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6849 "unsupported form of SLP reduction for"
6850 " variable-length vectors: the vector size"
6851 " is not a multiple of the number of results.\n");
6852 return false;
6856 if (reduction_type == COND_REDUCTION)
6858 widest_int ni;
6860 if (! max_loop_iterations (loop, &ni))
6862 if (dump_enabled_p ())
6863 dump_printf_loc (MSG_NOTE, vect_location,
6864 "loop count not known, cannot create cond "
6865 "reduction.\n");
6866 return false;
6868 /* Convert backedges to iterations. */
6869 ni += 1;
6871 /* The additional index will be the same type as the condition. Check
6872 that the loop count fits into this type less one (because we'll use up
6873 the zero slot for when there are no matches). */
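/* E.g. for an 8-bit scalar type the index type is also 8 bits wide, so
   only loops whose iteration count is known to be at most 254 (255 minus
   the reserved zero slot) can use this scheme.  */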
6874 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6875 if (wi::geu_p (ni, wi::to_widest (max_index)))
6877 if (dump_enabled_p ())
6878 dump_printf_loc (MSG_NOTE, vect_location,
6879 "loop size is greater than data size.\n");
6880 return false;
6884 /* In case the vectorization factor (VF) is bigger than the number
6885 of elements that we can fit in a vectype (nunits), we have to generate
6886 more than one vector stmt - i.e - we need to "unroll" the
6887 vector stmt by a factor VF/nunits. For more details see documentation
6888 in vectorizable_operation. */
6890 /* If the reduction is used in an outer loop we need to generate
6891 VF intermediate results, like so (e.g. for ncopies=2):
6892 r0 = phi (init, r0)
6893 r1 = phi (init, r1)
6894 r0 = x0 + r0;
6895 r1 = x1 + r1;
6896 (i.e. we generate VF results in 2 registers).
6897 In this case we have a separate def-use cycle for each copy, and therefore
6898 for each copy we get the vector def for the reduction variable from the
6899 respective phi node created for this copy.
6901 Otherwise (the reduction is unused in the loop nest), we can combine
6902 together intermediate results, like so (e.g. for ncopies=2):
6903 r = phi (init, r)
6904 r = x0 + r;
6905 r = x1 + r;
6906 (i.e. we generate VF/2 results in a single register).
6907 In this case for each copy we get the vector def for the reduction variable
6908 from the vectorized reduction operation generated in the previous iteration.
6910 This only works when we see both the reduction PHI and its only consumer
6911 in vectorizable_reduction and there are no intermediate stmts
6912 participating. */
6913 if (ncopies > 1
6914 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6915 && reduc_chain_length == 1)
6916 single_defuse_cycle = true;
6918 if (single_defuse_cycle || lane_reduc_code_p)
6920 gcc_assert (code != COND_EXPR);
6922 /* 4. Supportable by target? */
6923 bool ok = true;
6925 /* 4.1. check support for the operation in the loop */
6926 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6927 if (!optab)
6929 if (dump_enabled_p ())
6930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6931 "no optab.\n");
6932 ok = false;
6935 machine_mode vec_mode = TYPE_MODE (vectype_in);
6936 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6938 if (dump_enabled_p ())
6939 dump_printf (MSG_NOTE, "op not supported by target.\n");
6940 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6941 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6942 ok = false;
6943 else
6944 if (dump_enabled_p ())
6945 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6948 /* Worthwhile without SIMD support? */
6949 if (ok
6950 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6951 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6953 if (dump_enabled_p ())
6954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6955 "not worthwhile without SIMD support.\n");
6956 ok = false;
6959 /* lane-reducing operations have to go through vect_transform_reduction.
6960 For the other cases try without the single cycle optimization. */
6961 if (!ok)
6963 if (lane_reduc_code_p)
6964 return false;
6965 else
6966 single_defuse_cycle = false;
6969 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6971 /* If the reduction stmt is one of the patterns that has a lane-reducing
6972 operation embedded, we cannot handle the !single_defuse_cycle case. */
6973 if ((ncopies > 1 && ! single_defuse_cycle)
6974 && lane_reduc_code_p)
6976 if (dump_enabled_p ())
6977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6978 "multi def-use cycle not possible for lane-reducing "
6979 "reduction operation\n");
6980 return false;
6983 if (slp_node
6984 && !(!single_defuse_cycle
6985 && code != DOT_PROD_EXPR
6986 && code != WIDEN_SUM_EXPR
6987 && code != SAD_EXPR
6988 && reduction_type != FOLD_LEFT_REDUCTION))
6989 for (i = 0; i < op_type; i++)
6990 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
6992 if (dump_enabled_p ())
6993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6994 "incompatible vector types for invariants\n");
6995 return false;
6998 if (slp_node)
6999 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7000 else
7001 vec_num = 1;
7003 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7004 reduction_type, ncopies, cost_vec);
7005 if (dump_enabled_p ()
7006 && reduction_type == FOLD_LEFT_REDUCTION)
7007 dump_printf_loc (MSG_NOTE, vect_location,
7008 "using an in-order (fold-left) reduction.\n");
7009 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7010 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7011 reductions go through their own vectorizable_* routines. */
7012 if (!single_defuse_cycle
7013 && code != DOT_PROD_EXPR
7014 && code != WIDEN_SUM_EXPR
7015 && code != SAD_EXPR
7016 && reduction_type != FOLD_LEFT_REDUCTION)
7018 stmt_vec_info tem
7019 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7020 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7022 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7023 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7025 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7026 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7028 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7030 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7031 internal_fn cond_fn = get_conditional_internal_fn (code);
7033 if (reduction_type != FOLD_LEFT_REDUCTION
7034 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7035 && (cond_fn == IFN_LAST
7036 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7037 OPTIMIZE_FOR_SPEED)))
7039 if (dump_enabled_p ())
7040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7041 "can't operate on partial vectors because"
7042 " no conditional operation is available.\n");
7043 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7045 else if (reduction_type == FOLD_LEFT_REDUCTION
7046 && reduc_fn == IFN_LAST
7047 && !expand_vec_cond_expr_p (vectype_in,
7048 truth_type_for (vectype_in),
7049 SSA_NAME))
7051 if (dump_enabled_p ())
7052 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7053 "can't operate on partial vectors because"
7054 " no conditional operation is available.\n");
7055 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7057 else
7058 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7059 vectype_in, NULL);
7061 return true;
7064 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7065 value. */
7067 bool
7068 vect_transform_reduction (loop_vec_info loop_vinfo,
7069 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7070 gimple **vec_stmt, slp_tree slp_node)
7072 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7073 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7074 int i;
7075 int ncopies;
7076 int vec_num;
7078 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7079 gcc_assert (reduc_info->is_reduc_info);
7081 if (nested_in_vect_loop_p (loop, stmt_info))
7083 loop = loop->inner;
7084 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7087 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7088 enum tree_code code = gimple_assign_rhs_code (stmt);
7089 int op_type = TREE_CODE_LENGTH (code);
7091 /* Flatten RHS. */
7092 tree ops[3];
7093 switch (get_gimple_rhs_class (code))
7095 case GIMPLE_TERNARY_RHS:
7096 ops[2] = gimple_assign_rhs3 (stmt);
7097 /* Fall thru. */
7098 case GIMPLE_BINARY_RHS:
7099 ops[0] = gimple_assign_rhs1 (stmt);
7100 ops[1] = gimple_assign_rhs2 (stmt);
7101 break;
7102 default:
7103 gcc_unreachable ();
7106 /* All uses but the last are expected to be defined in the loop.
7107 The last use is the reduction variable. In case of nested cycle this
7108 assumption is not true: we use reduc_index to record the index of the
7109 reduction variable. */
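/* For instance, for the common form

     sum_2 = _x + sum_1;

   where sum_1 is defined by the reduction PHI, the PHI value is the
   second operand of the statement and reduc_index is 1.  */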
7110 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7111 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7112 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7113 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7115 if (slp_node)
7117 ncopies = 1;
7118 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7120 else
7122 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7123 vec_num = 1;
7126 internal_fn cond_fn = get_conditional_internal_fn (code);
7127 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7128 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7130 /* Transform. */
7131 tree new_temp = NULL_TREE;
7132 auto_vec<tree> vec_oprnds0;
7133 auto_vec<tree> vec_oprnds1;
7134 auto_vec<tree> vec_oprnds2;
7135 tree def0;
7137 if (dump_enabled_p ())
7138 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7140 /* FORNOW: Multiple types are not supported for condition. */
7141 if (code == COND_EXPR)
7142 gcc_assert (ncopies == 1);
7144 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7146 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7147 if (reduction_type == FOLD_LEFT_REDUCTION)
7149 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7150 return vectorize_fold_left_reduction
7151 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7152 reduc_fn, ops, vectype_in, reduc_index, masks);
7155 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7156 gcc_assert (single_defuse_cycle
7157 || code == DOT_PROD_EXPR
7158 || code == WIDEN_SUM_EXPR
7159 || code == SAD_EXPR);
7161 /* Create the destination vector */
7162 tree scalar_dest = gimple_assign_lhs (stmt);
7163 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7165 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7166 single_defuse_cycle && reduc_index == 0
7167 ? NULL_TREE : ops[0], &vec_oprnds0,
7168 single_defuse_cycle && reduc_index == 1
7169 ? NULL_TREE : ops[1], &vec_oprnds1,
7170 op_type == ternary_op
7171 && !(single_defuse_cycle && reduc_index == 2)
7172 ? ops[2] : NULL_TREE, &vec_oprnds2);
7173 if (single_defuse_cycle)
7175 gcc_assert (!slp_node);
7176 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7177 ops[reduc_index],
7178 reduc_index == 0 ? &vec_oprnds0
7179 : (reduc_index == 1 ? &vec_oprnds1
7180 : &vec_oprnds2));
7183 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7185 gimple *new_stmt;
7186 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7187 if (masked_loop_p && !mask_by_cond_expr)
7189 /* Make sure that the reduction accumulator is vop[0]. */
7190 if (reduc_index == 1)
7192 gcc_assert (commutative_tree_code (code));
7193 std::swap (vop[0], vop[1]);
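/* The conditional internal function call built below computes
   MASK ? VOP[0] CODE VOP[1] : VOP[0], so lanes that are inactive in
   this iteration simply keep the accumulator value.  */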
7195 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7196 vectype_in, i);
7197 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7198 vop[0], vop[1], vop[0]);
7199 new_temp = make_ssa_name (vec_dest, call);
7200 gimple_call_set_lhs (call, new_temp);
7201 gimple_call_set_nothrow (call, true);
7202 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7203 new_stmt = call;
7205 else
7207 if (op_type == ternary_op)
7208 vop[2] = vec_oprnds2[i];
7210 if (masked_loop_p && mask_by_cond_expr)
7212 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7213 vectype_in, i);
7214 build_vect_cond_expr (code, vop, mask, gsi);
7217 new_stmt = gimple_build_assign (vec_dest, code,
7218 vop[0], vop[1], vop[2]);
7219 new_temp = make_ssa_name (vec_dest, new_stmt);
7220 gimple_assign_set_lhs (new_stmt, new_temp);
7221 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7224 if (slp_node)
7225 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7226 else if (single_defuse_cycle
7227 && i < ncopies - 1)
7229 if (reduc_index == 0)
7230 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7231 else if (reduc_index == 1)
7232 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7233 else if (reduc_index == 2)
7234 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7236 else
7237 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7240 if (!slp_node)
7241 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7243 return true;
7246 /* Transform phase of a cycle PHI. */
7248 bool
7249 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7250 stmt_vec_info stmt_info, gimple **vec_stmt,
7251 slp_tree slp_node, slp_instance slp_node_instance)
7253 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7254 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7255 int i;
7256 int ncopies;
7257 int j;
7258 bool nested_cycle = false;
7259 int vec_num;
7261 if (nested_in_vect_loop_p (loop, stmt_info))
7263 loop = loop->inner;
7264 nested_cycle = true;
7267 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7268 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7269 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7270 gcc_assert (reduc_info->is_reduc_info);
7272 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7273 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7274 /* Leave the scalar phi in place. */
7275 return true;
7277 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7278 /* For a nested cycle we do not fill the above. */
7279 if (!vectype_in)
7280 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7281 gcc_assert (vectype_in);
7283 if (slp_node)
7285 /* The size vect_schedule_slp_instance computes is off for us. */
7286 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7287 * SLP_TREE_LANES (slp_node), vectype_in);
7288 ncopies = 1;
7290 else
7292 vec_num = 1;
7293 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7296 /* Check whether we should use a single PHI node and accumulate
7297 vectors to one before the backedge. */
7298 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7299 ncopies = 1;
7301 /* Create the destination vector */
7302 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7303 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7304 vectype_out);
7306 /* Get the loop-entry arguments. */
7307 tree vec_initial_def;
7308 auto_vec<tree> vec_initial_defs;
7309 if (slp_node)
7311 vec_initial_defs.reserve (vec_num);
7312 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7313 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7314 tree neutral_op
7315 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7316 STMT_VINFO_REDUC_CODE (reduc_info),
7317 first != NULL);
7318 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7319 &vec_initial_defs, vec_num,
7320 first != NULL, neutral_op);
7322 else
7324 /* Get at the scalar def before the loop, that defines the initial
7325 value of the reduction variable. */
7326 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7327 loop_preheader_edge (loop));
7328 /* Optimize: if for a REDUC_MAX the initial_def is smaller than the base
7329 value and we can't use zero for induc_val, use initial_def instead.
7330 Similarly for REDUC_MIN and an initial_def larger than the base. */
7331 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7333 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7334 if (TREE_CODE (initial_def) == INTEGER_CST
7335 && !integer_zerop (induc_val)
7336 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7337 && tree_int_cst_lt (initial_def, induc_val))
7338 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7339 && tree_int_cst_lt (induc_val, initial_def))))
7341 induc_val = initial_def;
7342 /* Communicate that we used the initial_def to epilogue
7343 generation. */
7344 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7346 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7347 vec_initial_defs.create (ncopies);
7348 for (i = 0; i < ncopies; ++i)
7349 vec_initial_defs.quick_push (vec_initial_def);
7351 else if (nested_cycle)
7353 /* Do not use an adjustment def as that case is not supported
7354 correctly if ncopies is not one. */
7355 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7356 ncopies, initial_def,
7357 &vec_initial_defs);
7359 else
7361 tree adjustment_def = NULL_TREE;
7362 tree *adjustment_defp = &adjustment_def;
7363 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7364 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7365 adjustment_defp = NULL;
7366 vec_initial_def
7367 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7368 initial_def, adjustment_defp);
7369 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7370 vec_initial_defs.create (ncopies);
7371 for (i = 0; i < ncopies; ++i)
7372 vec_initial_defs.quick_push (vec_initial_def);
7376 /* Generate the reduction PHIs upfront. */
7377 for (i = 0; i < vec_num; i++)
7379 tree vec_init_def = vec_initial_defs[i];
7380 for (j = 0; j < ncopies; j++)
7382 /* Create the reduction-phi that defines the reduction
7383 operand. */
7384 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7386 /* Set the loop-entry arg of the reduction-phi. */
7387 if (j != 0 && nested_cycle)
7388 vec_init_def = vec_initial_defs[j];
7389 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7390 UNKNOWN_LOCATION);
7392 /* The loop-latch arg is set in epilogue processing. */
7394 if (slp_node)
7395 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7396 else
7398 if (j == 0)
7399 *vec_stmt = new_phi;
7400 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7405 return true;
7408 /* Vectorizes LC PHIs. */
7410 bool
7411 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7412 stmt_vec_info stmt_info, gimple **vec_stmt,
7413 slp_tree slp_node)
7415 if (!loop_vinfo
7416 || !is_a <gphi *> (stmt_info->stmt)
7417 || gimple_phi_num_args (stmt_info->stmt) != 1)
7418 return false;
7420 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7421 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7422 return false;
7424 if (!vec_stmt) /* transformation not required. */
7426 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7427 return true;
7430 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7431 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7432 basic_block bb = gimple_bb (stmt_info->stmt);
7433 edge e = single_pred_edge (bb);
7434 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7435 auto_vec<tree> vec_oprnds;
7436 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7437 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7438 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7439 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7441 /* Create the vectorized LC PHI node. */
7442 gphi *new_phi = create_phi_node (vec_dest, bb);
7443 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7444 if (slp_node)
7445 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7446 else
7447 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7449 if (!slp_node)
7450 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7452 return true;
7456 /* Function vect_min_worthwhile_factor.
7458 For a loop where we could vectorize the operation indicated by CODE,
7459 return the minimum vectorization factor that makes it worthwhile
7460 to use generic vectors. */
7461 static unsigned int
7462 vect_min_worthwhile_factor (enum tree_code code)
7464 switch (code)
7466 case PLUS_EXPR:
7467 case MINUS_EXPR:
7468 case NEGATE_EXPR:
7469 return 4;
7471 case BIT_AND_EXPR:
7472 case BIT_IOR_EXPR:
7473 case BIT_XOR_EXPR:
7474 case BIT_NOT_EXPR:
7475 return 2;
7477 default:
7478 return INT_MAX;
7482 /* Return true if VINFO indicates we are doing loop vectorization and if
7483 it is worth decomposing CODE operations into scalar operations for
7484 that loop's vectorization factor. */
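/* For example, with a constant vectorization factor of 4, decomposing a
   PLUS_EXPR into four scalar operations is deemed worthwhile (the minimum
   factor for PLUS_EXPR above is 4), while a factor of 2 would only be
   worthwhile for the bitwise codes.  */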
7486 bool
7487 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7489 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7490 unsigned HOST_WIDE_INT value;
7491 return (loop_vinfo
7492 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7493 && value >= vect_min_worthwhile_factor (code));
7496 /* Function vectorizable_induction
7498 Check if STMT_INFO performs an induction computation that can be vectorized.
7499 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7500 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7501 Return true if STMT_INFO is vectorizable in this way. */
7503 bool
7504 vectorizable_induction (loop_vec_info loop_vinfo,
7505 stmt_vec_info stmt_info,
7506 gimple **vec_stmt, slp_tree slp_node,
7507 stmt_vector_for_cost *cost_vec)
7509 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7510 unsigned ncopies;
7511 bool nested_in_vect_loop = false;
7512 class loop *iv_loop;
7513 tree vec_def;
7514 edge pe = loop_preheader_edge (loop);
7515 basic_block new_bb;
7516 tree new_vec, vec_init, vec_step, t;
7517 tree new_name;
7518 gimple *new_stmt;
7519 gphi *induction_phi;
7520 tree induc_def, vec_dest;
7521 tree init_expr, step_expr;
7522 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7523 unsigned i;
7524 tree expr;
7525 gimple_seq stmts;
7526 gimple_stmt_iterator si;
7528 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7529 if (!phi)
7530 return false;
7532 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7533 return false;
7535 /* Make sure it was recognized as induction computation. */
7536 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7537 return false;
7539 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7540 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7542 if (slp_node)
7543 ncopies = 1;
7544 else
7545 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7546 gcc_assert (ncopies >= 1);
7548 /* FORNOW. These restrictions should be relaxed. */
7549 if (nested_in_vect_loop_p (loop, stmt_info))
7551 imm_use_iterator imm_iter;
7552 use_operand_p use_p;
7553 gimple *exit_phi;
7554 edge latch_e;
7555 tree loop_arg;
7557 if (ncopies > 1)
7559 if (dump_enabled_p ())
7560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7561 "multiple types in nested loop.\n");
7562 return false;
7565 /* FORNOW: outer loop induction with SLP not supported. */
7566 if (STMT_SLP_TYPE (stmt_info))
7567 return false;
7569 exit_phi = NULL;
7570 latch_e = loop_latch_edge (loop->inner);
7571 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7572 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7574 gimple *use_stmt = USE_STMT (use_p);
7575 if (is_gimple_debug (use_stmt))
7576 continue;
7578 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7580 exit_phi = use_stmt;
7581 break;
7584 if (exit_phi)
7586 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7587 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7588 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7590 if (dump_enabled_p ())
7591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7592 "inner-loop induction only used outside "
7593 "of the outer vectorized loop.\n");
7594 return false;
7598 nested_in_vect_loop = true;
7599 iv_loop = loop->inner;
7601 else
7602 iv_loop = loop;
7603 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7605 if (slp_node && !nunits.is_constant ())
7607 /* The current SLP code creates the initial value element-by-element. */
7608 if (dump_enabled_p ())
7609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7610 "SLP induction not supported for variable-length"
7611 " vectors.\n");
7612 return false;
7615 if (!vec_stmt) /* transformation not required. */
7617 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7618 DUMP_VECT_SCOPE ("vectorizable_induction");
7619 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7620 return true;
7623 /* Transform. */
7625 /* Compute a vector variable, initialized with the first VF values of
7626 the induction variable. E.g., for an iv with IV_PHI='X' and
7627 evolution S, for a vector of 4 units, we want to compute:
7628 [X, X + S, X + 2*S, X + 3*S]. */
7630 if (dump_enabled_p ())
7631 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7633 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7634 gcc_assert (step_expr != NULL_TREE);
7635 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7637 pe = loop_preheader_edge (iv_loop);
7638 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7639 loop_preheader_edge (iv_loop));
7641 stmts = NULL;
7642 if (!nested_in_vect_loop)
7644 /* Convert the initial value to the IV update type. */
7645 tree new_type = TREE_TYPE (step_expr);
7646 init_expr = gimple_convert (&stmts, new_type, init_expr);
7648 /* If we are using the loop mask to "peel" for alignment then we need
7649 to adjust the start value here. */
7650 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7651 if (skip_niters != NULL_TREE)
7653 if (FLOAT_TYPE_P (vectype))
7654 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7655 skip_niters);
7656 else
7657 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7658 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7659 skip_niters, step_expr);
7660 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7661 init_expr, skip_step);
7665 if (stmts)
7667 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7668 gcc_assert (!new_bb);
7671 /* Find the first insertion point in the BB. */
7672 basic_block bb = gimple_bb (phi);
7673 si = gsi_after_labels (bb);
7675 /* For SLP induction we have to generate several IVs: for example, with
7676 group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7677 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7678 [VF*S, VF*S, VF*S, VF*S] for all. */
7679 if (slp_node)
7681 /* Enforced above. */
7682 unsigned int const_nunits = nunits.to_constant ();
7684 /* Generate [VF*S, VF*S, ... ]. */
7685 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7687 expr = build_int_cst (integer_type_node, vf);
7688 expr = fold_convert (TREE_TYPE (step_expr), expr);
7690 else
7691 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7692 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7693 expr, step_expr);
7694 if (! CONSTANT_CLASS_P (new_name))
7695 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7696 TREE_TYPE (step_expr), NULL);
7697 new_vec = build_vector_from_val (step_vectype, new_name);
7698 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7699 new_vec, step_vectype, NULL);
7701 /* Now generate the IVs. */
7702 unsigned group_size = SLP_TREE_LANES (slp_node);
7703 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7704 unsigned elts = const_nunits * nvects;
7705 /* Compute the number of distinct IVs we need. First reduce
7706 group_size if it is a multiple of const_nunits so we get
7707 one IV for a group_size of 4 but const_nunits 2. */
7708 unsigned group_sizep = group_size;
7709 if (group_sizep % const_nunits == 0)
7710 group_sizep = group_sizep / const_nunits;
7711 unsigned nivs = least_common_multiple (group_sizep,
7712 const_nunits) / const_nunits;
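/* E.g. for the group_size 3, const_nunits 4 case shown above this yields
   least_common_multiple (3, 4) / 4 == 3 distinct IVs.  */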
7713 gcc_assert (elts % group_size == 0);
7714 tree elt = init_expr;
7715 unsigned ivn;
7716 for (ivn = 0; ivn < nivs; ++ivn)
7718 tree_vector_builder elts (step_vectype, const_nunits, 1);
7719 stmts = NULL;
7720 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7722 if (ivn*const_nunits + eltn >= group_size
7723 && (ivn * const_nunits + eltn) % group_size == 0)
7724 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7725 elt, step_expr);
7726 elts.quick_push (elt);
7728 vec_init = gimple_build_vector (&stmts, &elts);
7729 vec_init = gimple_convert (&stmts, vectype, vec_init);
7730 if (stmts)
7732 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7733 gcc_assert (!new_bb);
7736 /* Create the induction-phi that defines the induction-operand. */
7737 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7738 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7739 induc_def = PHI_RESULT (induction_phi);
7741 /* Create the iv update inside the loop */
7742 gimple_seq stmts = NULL;
7743 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7744 vec_def = gimple_build (&stmts,
7745 PLUS_EXPR, step_vectype, vec_def, vec_step);
7746 vec_def = gimple_convert (&stmts, vectype, vec_def);
7747 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7749 /* Set the arguments of the phi node: */
7750 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7751 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7752 UNKNOWN_LOCATION);
7754 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7756 /* Fill up to the number of vectors we need for the whole group. */
7757 nivs = least_common_multiple (group_size,
7758 const_nunits) / const_nunits;
7759 for (; ivn < nivs; ++ivn)
7760 SLP_TREE_VEC_STMTS (slp_node)
7761 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
7763 /* Re-use IVs when we can. */
7764 if (ivn < nvects)
7766 unsigned vfp
7767 = least_common_multiple (group_size, const_nunits) / group_size;
7768 /* Generate [VF'*S, VF'*S, ... ]. */
7769 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7771 expr = build_int_cst (integer_type_node, vfp);
7772 expr = fold_convert (TREE_TYPE (step_expr), expr);
7774 else
7775 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7776 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7777 expr, step_expr);
7778 if (! CONSTANT_CLASS_P (new_name))
7779 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7780 TREE_TYPE (step_expr), NULL);
7781 new_vec = build_vector_from_val (step_vectype, new_name);
7782 vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
7783 step_vectype, NULL);
7784 for (; ivn < nvects; ++ivn)
7786 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7787 tree def;
7788 if (gimple_code (iv) == GIMPLE_PHI)
7789 def = gimple_phi_result (iv);
7790 else
7791 def = gimple_assign_lhs (iv);
7792 gimple_seq stmts = NULL;
7793 def = gimple_convert (&stmts, step_vectype, def);
7794 def = gimple_build (&stmts,
7795 PLUS_EXPR, step_vectype, def, vec_step);
7796 def = gimple_convert (&stmts, vectype, def);
7797 if (gimple_code (iv) == GIMPLE_PHI)
7798 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7799 else
7801 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7802 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7804 SLP_TREE_VEC_STMTS (slp_node)
7805 .quick_push (SSA_NAME_DEF_STMT (def));
7809 return true;
7812 /* Create the vector that holds the initial_value of the induction. */
7813 if (nested_in_vect_loop)
7815 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7816 been created during vectorization of previous stmts. We obtain it
7817 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7818 auto_vec<tree> vec_inits;
7819 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7820 init_expr, &vec_inits);
7821 vec_init = vec_inits[0];
7822 /* If the initial value is not of proper type, convert it. */
7823 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7825 new_stmt
7826 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7827 vect_simple_var,
7828 "vec_iv_"),
7829 VIEW_CONVERT_EXPR,
7830 build1 (VIEW_CONVERT_EXPR, vectype,
7831 vec_init));
7832 vec_init = gimple_assign_lhs (new_stmt);
7833 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7834 new_stmt);
7835 gcc_assert (!new_bb);
7838 else
7840 /* iv_loop is the loop to be vectorized. Create:
7841 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7842 stmts = NULL;
7843 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7845 unsigned HOST_WIDE_INT const_nunits;
7846 if (nunits.is_constant (&const_nunits))
7848 tree_vector_builder elts (step_vectype, const_nunits, 1);
7849 elts.quick_push (new_name);
7850 for (i = 1; i < const_nunits; i++)
7852 /* Create: new_name_i = new_name + step_expr */
7853 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7854 new_name, step_expr);
7855 elts.quick_push (new_name);
7857 /* Create a vector from [new_name_0, new_name_1, ...,
7858 new_name_nunits-1] */
7859 vec_init = gimple_build_vector (&stmts, &elts);
7861 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7862 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7863 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7864 new_name, step_expr);
7865 else
7867 /* Build:
7868 [base, base, base, ...]
7869 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7870 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7871 gcc_assert (flag_associative_math);
7872 tree index = build_index_vector (step_vectype, 0, 1);
7873 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7874 new_name);
7875 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7876 step_expr);
7877 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7878 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7879 vec_init, step_vec);
7880 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7881 vec_init, base_vec);
7883 vec_init = gimple_convert (&stmts, vectype, vec_init);
7885 if (stmts)
7887 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7888 gcc_assert (!new_bb);
7893 /* Create the vector that holds the step of the induction. */
7894 if (nested_in_vect_loop)
7895 /* iv_loop is nested in the loop to be vectorized. Generate:
7896 vec_step = [S, S, S, S] */
7897 new_name = step_expr;
7898 else
7900 /* iv_loop is the loop to be vectorized. Generate:
7901 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7902 gimple_seq seq = NULL;
7903 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7905 expr = build_int_cst (integer_type_node, vf);
7906 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7908 else
7909 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7910 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7911 expr, step_expr);
7912 if (seq)
7914 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7915 gcc_assert (!new_bb);
7919 t = unshare_expr (new_name);
7920 gcc_assert (CONSTANT_CLASS_P (new_name)
7921 || TREE_CODE (new_name) == SSA_NAME);
7922 new_vec = build_vector_from_val (step_vectype, t);
7923 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7924 new_vec, step_vectype, NULL);
7927 /* Create the following def-use cycle:
7928 loop prolog:
7929 vec_init = ...
7930 vec_step = ...
7931 loop:
7932 vec_iv = PHI <vec_init, vec_loop>
7934 STMT
7936 vec_loop = vec_iv + vec_step; */
7938 /* Create the induction-phi that defines the induction-operand. */
7939 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7940 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7941 induc_def = PHI_RESULT (induction_phi);
7943 /* Create the iv update inside the loop */
7944 stmts = NULL;
7945 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7946 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7947 vec_def = gimple_convert (&stmts, vectype, vec_def);
7948 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7949 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7951 /* Set the arguments of the phi node: */
7952 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7953 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7954 UNKNOWN_LOCATION);
7956 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
7957 *vec_stmt = induction_phi;
7959 /* In case the vectorization factor (VF) is bigger than the number
7960 of elements that we can fit in a vectype (nunits), we have to generate
7961 more than one vector stmt - i.e - we need to "unroll" the
7962 vector stmt by a factor VF/nunits. For more details see documentation
7963 in vectorizable_operation. */
7965 if (ncopies > 1)
7967 gimple_seq seq = NULL;
7968 /* FORNOW. This restriction should be relaxed. */
7969 gcc_assert (!nested_in_vect_loop);
7971 /* Create the vector that holds the step of the induction. */
7972 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7974 expr = build_int_cst (integer_type_node, nunits);
7975 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7977 else
7978 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7979 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7980 expr, step_expr);
7981 if (seq)
7983 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7984 gcc_assert (!new_bb);
7987 t = unshare_expr (new_name);
7988 gcc_assert (CONSTANT_CLASS_P (new_name)
7989 || TREE_CODE (new_name) == SSA_NAME);
7990 new_vec = build_vector_from_val (step_vectype, t);
7991 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7992 new_vec, step_vectype, NULL);
7994 vec_def = induc_def;
7995 for (i = 1; i < ncopies; i++)
7997 /* vec_i = vec_prev + vec_step */
7998 gimple_seq stmts = NULL;
7999 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8000 vec_def = gimple_build (&stmts,
8001 PLUS_EXPR, step_vectype, vec_def, vec_step);
8002 vec_def = gimple_convert (&stmts, vectype, vec_def);
8004 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8005 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8006 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8010 if (dump_enabled_p ())
8011 dump_printf_loc (MSG_NOTE, vect_location,
8012 "transform induction: created def-use cycle: %G%G",
8013 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8015 return true;
8018 /* Function vectorizable_live_operation.
8020 STMT_INFO computes a value that is used outside the loop. Check if
8021 it can be supported. */
8023 bool
8024 vectorizable_live_operation (vec_info *vinfo,
8025 stmt_vec_info stmt_info,
8026 gimple_stmt_iterator *gsi,
8027 slp_tree slp_node, slp_instance slp_node_instance,
8028 int slp_index, bool vec_stmt_p,
8029 stmt_vector_for_cost *cost_vec)
8031 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8032 imm_use_iterator imm_iter;
8033 tree lhs, lhs_type, bitsize, vec_bitsize;
8034 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8035 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8036 int ncopies;
8037 gimple *use_stmt;
8038 auto_vec<tree> vec_oprnds;
8039 int vec_entry = 0;
8040 poly_uint64 vec_index = 0;
8042 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8044 /* If a stmt of a reduction is live, vectorize it via
8045 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8046 validity so just trigger the transform here. */
8047 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8049 if (!vec_stmt_p)
8050 return true;
8051 if (slp_node)
8053 /* For reduction chains the meta-info is attached to
8054 the group leader. */
8055 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8056 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8057 /* For SLP reductions we vectorize the epilogue for
8058 all involved stmts together. */
8059 else if (slp_index != 0)
8060 return true;
8061 else
8062 /* For SLP reductions the meta-info is attached to
8063 the representative. */
8064 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8066 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8067 gcc_assert (reduc_info->is_reduc_info);
8068 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8069 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8070 return true;
8071 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8072 slp_node_instance);
8073 return true;
8076 /* If STMT is not relevant and it is a simple assignment and its inputs are
8077 invariant then it can remain in place, unvectorized. The original last
8078 scalar value that it computes will be used. */
8079 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8081 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8082 if (dump_enabled_p ())
8083 dump_printf_loc (MSG_NOTE, vect_location,
8084 "statement is simple and uses invariant. Leaving in "
8085 "place.\n");
8086 return true;
8089 if (slp_node)
8090 ncopies = 1;
8091 else
8092 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8094 if (slp_node)
8096 gcc_assert (slp_index >= 0);
8098 /* Get the last occurrence of the scalar index from the concatenation of
8099 all the slp vectors. Calculate which slp vector it is and the index
8100 within. */
8101 int num_scalar = SLP_TREE_LANES (slp_node);
8102 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8103 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
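/* For instance, with two SLP lanes, V4SI vectors and two vector stmts
   the concatenation has 2 * 4 == 8 slots; the last copy of lane 0 sits
   at position 8 - 2 + 0 == 6, i.e. element 2 of the second vector
   (counting from zero).  */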
8105 /* Calculate which vector contains the result, and which lane of
8106 that vector we need. */
8107 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8109 if (dump_enabled_p ())
8110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8111 "Cannot determine which vector holds the"
8112 " final result.\n");
8113 return false;
8117 if (!vec_stmt_p)
8119 /* No transformation required. */
8120 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8122 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8123 OPTIMIZE_FOR_SPEED))
8125 if (dump_enabled_p ())
8126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8127 "can't operate on partial vectors "
8128 "because the target doesn't support extract "
8129 "last reduction.\n");
8130 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8132 else if (slp_node)
8134 if (dump_enabled_p ())
8135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8136 "can't operate on partial vectors "
8137 "because an SLP statement is live after "
8138 "the loop.\n");
8139 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8141 else if (ncopies > 1)
8143 if (dump_enabled_p ())
8144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8145 "can't operate on partial vectors "
8146 "because ncopies is greater than 1.\n");
8147 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8149 else
8151 gcc_assert (ncopies == 1 && !slp_node);
8152 vect_record_loop_mask (loop_vinfo,
8153 &LOOP_VINFO_MASKS (loop_vinfo),
8154 1, vectype, NULL);
8157 /* ??? Enable for loop costing as well. */
8158 if (!loop_vinfo)
8159 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8160 0, vect_epilogue);
8161 return true;
8164 /* Use the lhs of the original scalar statement. */
8165 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8166 if (dump_enabled_p ())
8167 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8168 "stmt %G", stmt);
8170 lhs = gimple_get_lhs (stmt);
8171 lhs_type = TREE_TYPE (lhs);
8173 bitsize = vector_element_bits_tree (vectype);
8174 vec_bitsize = TYPE_SIZE (vectype);
8176 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8177 tree vec_lhs, bitstart;
8178 gimple *vec_stmt;
8179 if (slp_node)
8181 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8183 /* Get the correct slp vectorized stmt. */
8184 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8185 vec_lhs = gimple_get_lhs (vec_stmt);
8187 /* Get entry to use. */
8188 bitstart = bitsize_int (vec_index);
8189 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8191 else
8193 /* For multiple copies, get the last copy. */
8194 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8195 vec_lhs = gimple_get_lhs (vec_stmt);
8197 /* Get the last lane in the vector. */
8198 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8201 if (loop_vinfo)
8203 /* Ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
8204 PHI requirement: insert one phi node for it. It looks like:
8205 loop;
8207 # lhs' = PHI <lhs>
==>
8209 loop;
8211 # vec_lhs' = PHI <vec_lhs>
8212 new_tree = lane_extract <vec_lhs', ...>;
8213 lhs' = new_tree; */
8215 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8216 basic_block exit_bb = single_exit (loop)->dest;
8217 gcc_assert (single_pred_p (exit_bb));
8219 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8220 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8221 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8223 gimple_seq stmts = NULL;
8224 tree new_tree;
8225 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8227 /* Emit:
8229 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8231 where VEC_LHS is the vectorized live-out result and MASK is
8232 the loop mask for the final iteration. */
8233 gcc_assert (ncopies == 1 && !slp_node);
8234 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8235 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8236 1, vectype, 0);
8237 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8238 mask, vec_lhs_phi);
8240 /* Convert the extracted vector element to the scalar type. */
8241 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8243 else
8245 tree bftype = TREE_TYPE (vectype);
8246 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8247 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8248 new_tree = build3 (BIT_FIELD_REF, bftype,
8249 vec_lhs_phi, bitsize, bitstart);
8250 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8251 &stmts, true, NULL_TREE);
8254 if (stmts)
8256 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8257 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8259 /* Remove existing phi from lhs and create one copy from new_tree. */
8260 tree lhs_phi = NULL_TREE;
8261 gimple_stmt_iterator gsi;
8262 for (gsi = gsi_start_phis (exit_bb);
8263 !gsi_end_p (gsi); gsi_next (&gsi))
8265 gimple *phi = gsi_stmt (gsi);
8266 if ((gimple_phi_arg_def (phi, 0) == lhs))
8268 remove_phi_node (&gsi, false);
8269 lhs_phi = gimple_phi_result (phi);
8270 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8271 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8272 break;
8277 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8278 single-argument PHI, just replace all uses of the PHI result. This is
8279 necessary because the lcssa PHI defining lhs may be before the newly inserted stmt. */
8280 use_operand_p use_p;
8281 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8282 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8283 && !is_gimple_debug (use_stmt))
8285 if (gimple_code (use_stmt) == GIMPLE_PHI
8286 && gimple_phi_num_args (use_stmt) == 1)
8288 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8290 else
8292 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8293 SET_USE (use_p, new_tree);
8295 update_stmt (use_stmt);
8298 else
8300 /* For basic-block vectorization simply insert the lane-extraction. */
8301 tree bftype = TREE_TYPE (vectype);
8302 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8303 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8304 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8305 vec_lhs, bitsize, bitstart);
8306 gimple_seq stmts = NULL;
8307 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8308 &stmts, true, NULL_TREE);
8310 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
8312 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8313 single-argument PHI, just replace all uses of the PHI result. This is
8314 necessary because the lcssa PHI defining lhs may be before the newly inserted stmt. */
8315 use_operand_p use_p;
8316 stmt_vec_info use_stmt_info;
8317 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8318 if (!is_gimple_debug (use_stmt)
8319 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8320 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8322 /* ??? This can happen when the live lane ends up being
8323 used in a vector construction code-generated by an
8324 external SLP node (and code-generation for that already
8325 happened). See gcc.dg/vect/bb-slp-47.c.
8326 Doing this is what would happen if that vector CTOR
8327 were not code-generated yet so it is not too bad.
8328 ??? In fact we'd likely want to avoid this situation
8329 in the first place. */
8330 if (gimple_code (use_stmt) != GIMPLE_PHI
8331 && !vect_stmt_dominates_stmt_p (gsi_stmt (*gsi), use_stmt))
8333 gcc_assert (is_gimple_assign (use_stmt)
8334 && gimple_assign_rhs_code (use_stmt) == CONSTRUCTOR);
8335 if (dump_enabled_p ())
8336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8337 "Using original scalar computation for "
8338 "live lane because use preceeds vector "
8339 "def\n");
8340 continue;
8342 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8343 SET_USE (use_p, new_tree);
8344 update_stmt (use_stmt);
8348 return true;
8351 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8353 static void
8354 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8356 ssa_op_iter op_iter;
8357 imm_use_iterator imm_iter;
8358 def_operand_p def_p;
8359 gimple *ustmt;
8361 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8363 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8365 basic_block bb;
8367 if (!is_gimple_debug (ustmt))
8368 continue;
8370 bb = gimple_bb (ustmt);
8372 if (!flow_bb_inside_loop_p (loop, bb))
8374 if (gimple_debug_bind_p (ustmt))
8376 if (dump_enabled_p ())
8377 dump_printf_loc (MSG_NOTE, vect_location,
8378 "killing debug use\n");
8380 gimple_debug_bind_reset_value (ustmt);
8381 update_stmt (ustmt);
8383 else
8384 gcc_unreachable ();
8390 /* Given loop represented by LOOP_VINFO, return true if computation of
8391 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8392 otherwise. */
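/* For instance, if the loop's counter runs through the full range of a
   32-bit unsigned type, NITERSM1 is 0xffffffff and NITERS, computed as
   NITERSM1 + 1 in the same type, wraps around to zero, so the answer
   here is false.  */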
8394 static bool
8395 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8397 /* Constant case. */
8398 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8400 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8401 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8403 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8404 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8405 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8406 return true;
8409 widest_int max;
8410 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8411 /* Check the upper bound of loop niters. */
8412 if (get_max_loop_iterations (loop, &max))
8414 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8415 signop sgn = TYPE_SIGN (type);
8416 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8417 if (max < type_max)
8418 return true;
8420 return false;
8423 /* Return a mask type with half the number of elements as OLD_TYPE,
8424 given that it should have mode NEW_MODE. */
8426 tree
8427 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8429 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8430 return build_truth_vector_type_for_mode (nunits, new_mode);
8433 /* Return a mask type with twice as many elements as OLD_TYPE,
8434 given that it should have mode NEW_MODE. */
8436 tree
8437 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8439 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8440 return build_truth_vector_type_for_mode (nunits, new_mode);
8443 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8444 contain a sequence of NVECTORS masks that each control a vector of type
8445 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8446 these vector masks with the vector version of SCALAR_MASK. */
8448 void
8449 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8450 unsigned int nvectors, tree vectype, tree scalar_mask)
8452 gcc_assert (nvectors != 0);
8453 if (masks->length () < nvectors)
8454 masks->safe_grow_cleared (nvectors, true);
8455 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8456 /* The number of scalars per iteration and the number of vectors are
8457 both compile-time constants. */
8458 unsigned int nscalars_per_iter
8459 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8460 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8462 if (scalar_mask)
8464 scalar_cond_masked_key cond (scalar_mask, nvectors);
8465 loop_vinfo->scalar_cond_masked_set.add (cond);
8468 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8470 rgm->max_nscalars_per_iter = nscalars_per_iter;
8471 rgm->type = truth_type_for (vectype);
8472 rgm->factor = 1;
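/* Worked example with illustrative numbers: for a loop with VF == 16,
   recording an rgroup of NVECTORS == 2 masks over an 8-element VECTYPE
   gives nscalars_per_iter == 2 * 8 / 16 == 1; the rgroup lives in
   MASKS[NVECTORS - 1] and its mask type is the truth type of VECTYPE.  */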
8476 /* Given a complete set of masks MASKS, extract mask number INDEX
8477 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8478 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8480 See the comment above vec_loop_masks for more details about the mask
8481 arrangement. */
8483 tree
8484 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8485 unsigned int nvectors, tree vectype, unsigned int index)
8487 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8488 tree mask_type = rgm->type;
8490 /* Populate the rgroup's mask array, if this is the first time we've
8491 used it. */
8492 if (rgm->controls.is_empty ())
8494 rgm->controls.safe_grow_cleared (nvectors, true);
8495 for (unsigned int i = 0; i < nvectors; ++i)
8497 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8498 /* Provide a dummy definition until the real one is available. */
8499 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8500 rgm->controls[i] = mask;
8504 tree mask = rgm->controls[index];
8505 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8506 TYPE_VECTOR_SUBPARTS (vectype)))
8508 /* A loop mask for data type X can be reused for data type Y
8509 if X has N times more elements than Y and if Y's elements
8510 are N times bigger than X's. In this case each sequence
8511 of N elements in the loop mask will be all-zero or all-one.
8512 We can then view-convert the mask so that each sequence of
8513 N elements is replaced by a single element. */
8514 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8515 TYPE_VECTOR_SUBPARTS (vectype)));
8516 gimple_seq seq = NULL;
8517 mask_type = truth_type_for (vectype);
8518 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8519 if (seq)
8520 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8522 return mask;
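/* Illustration of the reuse case above: a mask computed for sixteen
   1-byte elements can also control eight 2-byte elements.  Each pair of
   mask elements is known to be all-zero or all-one, so a single
   VIEW_CONVERT_EXPR reinterprets it as an 8-element mask of the wider
   element size.  */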
8525 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8526 lengths for controlling an operation on VECTYPE. The operation splits
8527 each element of VECTYPE into FACTOR separate subelements, measuring the
8528 length as a number of these subelements. */
8530 void
8531 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8532 unsigned int nvectors, tree vectype, unsigned int factor)
8534 gcc_assert (nvectors != 0);
8535 if (lens->length () < nvectors)
8536 lens->safe_grow_cleared (nvectors, true);
8537 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8539 /* The number of scalars per iteration, the bytes occupied per scalar and
8540 the number of vectors are all compile-time constants. */
8541 unsigned int nscalars_per_iter
8542 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8543 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8545 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8547 /* For now, we only support cases in which all loads and stores fall back
8548 to VnQI or none do. */
8549 gcc_assert (!rgl->max_nscalars_per_iter
8550 || (rgl->factor == 1 && factor == 1)
8551 || (rgl->max_nscalars_per_iter * rgl->factor
8552 == nscalars_per_iter * factor));
8553 rgl->max_nscalars_per_iter = nscalars_per_iter;
8554 rgl->type = vectype;
8555 rgl->factor = factor;
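/* Illustrative example: if VECTYPE is an 8 x 16-bit vector but the
   target measures lengths in byte-sized subelements, FACTOR == 2 and
   each scalar iteration is counted as nscalars_per_iter * 2
   subelements.  */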
8559 /* Given a complete set of lengths LENS, extract length number INDEX for an
8560 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
8562 tree
8563 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8564 unsigned int nvectors, unsigned int index)
8566 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8568 /* Populate the rgroup's len array, if this is the first time we've
8569 used it. */
8570 if (rgl->controls.is_empty ())
8572 rgl->controls.safe_grow_cleared (nvectors, true);
8573 for (unsigned int i = 0; i < nvectors; ++i)
8575 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8576 gcc_assert (len_type != NULL_TREE);
8577 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
8579 /* Provide a dummy definition until the real one is available. */
8580 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
8581 rgl->controls[i] = len;
8585 return rgl->controls[index];
8588 /* Scale profiling counters by estimation for LOOP which is vectorized
8589 by factor VF. */
8591 static void
8592 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8594 edge preheader = loop_preheader_edge (loop);
8595 /* Reduce loop iterations by the vectorization factor. */
8596 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8597 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8599 if (freq_h.nonzero_p ())
8601 profile_probability p;
8603 /* Avoid dropping loop body profile counter to 0 because of zero count
8604 in loop's preheader. */
8605 if (!(freq_e == profile_count::zero ()))
8606 freq_e = freq_e.force_nonzero ();
8607 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8608 scale_loop_frequencies (loop, p);
8611 edge exit_e = single_exit (loop);
8612 exit_e->probability = profile_probability::always ()
8613 .apply_scale (1, new_est_niter + 1);
8615 edge exit_l = single_pred_edge (loop->latch);
8616 profile_probability prob = exit_l->probability;
8617 exit_l->probability = exit_e->probability.invert ();
8618 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8619 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
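/* Rough illustration: if the scalar loop was estimated to run 128 times
   and VF == 8, new_est_niter is about 16, the exit edge probability
   becomes roughly 1 / (16 + 1) and the body frequencies are scaled down
   correspondingly.  */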
8622 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
8623 latch edge values originally defined by it. */
8625 static void
8626 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
8627 stmt_vec_info def_stmt_info)
8629 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
8630 if (!def || TREE_CODE (def) != SSA_NAME)
8631 return;
8632 stmt_vec_info phi_info;
8633 imm_use_iterator iter;
8634 use_operand_p use_p;
8635 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
8636 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
8637 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
8638 && (phi_info = loop_vinfo->lookup_stmt (phi))
8639 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
8640 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
8641 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
8643 loop_p loop = gimple_bb (phi)->loop_father;
8644 edge e = loop_latch_edge (loop);
8645 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
8647 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
8648 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
8649 gcc_assert (phi_defs.length () == latch_defs.length ());
8650 for (unsigned i = 0; i < phi_defs.length (); ++i)
8651 add_phi_arg (as_a <gphi *> (phi_defs[i]),
8652 gimple_get_lhs (latch_defs[i]), e,
8653 gimple_phi_arg_location (phi, e->dest_idx));
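/* For instance, if a reduction PHI was vectorized into two vector PHIs,
   STMT_VINFO_VEC_STMTS of the PHI and of DEF_STMT_INFO have the same
   length and each vector PHI receives the corresponding vectorized
   latch definition as its backedge argument.  */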
8658 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8659 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8660 stmt_vec_info. */
8662 static void
8663 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8664 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8666 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8667 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8669 if (dump_enabled_p ())
8670 dump_printf_loc (MSG_NOTE, vect_location,
8671 "------>vectorizing statement: %G", stmt_info->stmt);
8673 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8674 vect_loop_kill_debug_uses (loop, stmt_info);
8676 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8677 && !STMT_VINFO_LIVE_P (stmt_info))
8678 return;
8680 if (STMT_VINFO_VECTYPE (stmt_info))
8682 poly_uint64 nunits
8683 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8684 if (!STMT_SLP_TYPE (stmt_info)
8685 && maybe_ne (nunits, vf)
8686 && dump_enabled_p ())
8687 /* For SLP, VF is set according to the unrolling factor, and not
8688 to the vector size, hence this print is not valid for SLP. */
8689 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8692 /* Pure SLP statements have already been vectorized. We still need
8693 to apply loop vectorization to hybrid SLP statements. */
8694 if (PURE_SLP_STMT (stmt_info))
8695 return;
8697 if (dump_enabled_p ())
8698 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8700 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8701 *seen_store = stmt_info;
8704 /* Helper function to pass to simplify_replace_tree to enable replacing trees
8705 in the hash_map with their corresponding values. */
8707 static tree
8708 find_in_mapping (tree t, void *context)
8710 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8712 tree *value = mapping->get (t);
8713 return value ? *value : t;
8716 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8717 original loop that has now been vectorized.
8719 The inits of the data_references need to be advanced with the number of
8720 iterations of the main loop. This has been computed in vect_do_peeling and
8721 is stored in parameter ADVANCE. We first restore the data_references'
8722 initial offsets with the values recorded in ORIG_DRS_INIT.
8724 Since the loop_vec_info of this EPILOGUE was constructed for the original
8725 loop, its stmt_vec_infos all point to the original statements. These need
8726 to be updated to point to their corresponding copies as well as the SSA_NAMES
8727 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8729 The data_references' connections also need to be updated: their
8730 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
8731 stmt_vec_infos and their statements need to point to their corresponding
8732 copies. If they are gather loads or scatter stores, their references need
8733 to be updated to point to their corresponding copies. Finally we set
8734 'base_misaligned' to false as we have already peeled for alignment in the
8735 prologue of the main loop. */
8737 static void
8738 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8740 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8741 auto_vec<gimple *> stmt_worklist;
8742 hash_map<tree,tree> mapping;
8743 gimple *orig_stmt, *new_stmt;
8744 gimple_stmt_iterator epilogue_gsi;
8745 gphi_iterator epilogue_phi_gsi;
8746 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8747 basic_block *epilogue_bbs = get_loop_body (epilogue);
8748 unsigned i;
8750 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8752 /* Advance the data_references with the number of iterations of the previous
8753 loop and its prologue. */
8754 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8757 /* The EPILOGUE loop is a copy of the original loop so they share the same
8758 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8759 point to the copied statements. We also create a mapping from each LHS in
8760 the original loop to the corresponding LHS in the EPILOGUE and create worklists
8761 to update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8762 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8764 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8765 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8767 new_stmt = epilogue_phi_gsi.phi ();
8769 gcc_assert (gimple_uid (new_stmt) > 0);
8770 stmt_vinfo
8771 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8773 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8774 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8776 mapping.put (gimple_phi_result (orig_stmt),
8777 gimple_phi_result (new_stmt));
8778 /* PHI nodes cannot have patterns or related statements. */
8779 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8780 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8783 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8784 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8786 new_stmt = gsi_stmt (epilogue_gsi);
8787 if (is_gimple_debug (new_stmt))
8788 continue;
8790 gcc_assert (gimple_uid (new_stmt) > 0);
8791 stmt_vinfo
8792 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8794 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8795 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8797 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8798 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8800 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8802 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8803 for (gimple_stmt_iterator gsi = gsi_start (seq);
8804 !gsi_end_p (gsi); gsi_next (&gsi))
8805 stmt_worklist.safe_push (gsi_stmt (gsi));
8808 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8809 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8811 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8812 stmt_worklist.safe_push (stmt);
8813 /* Set BB such that the assert in
8814 'get_initial_def_for_reduction' is able to determine that
8815 the BB of the related stmt is inside this loop. */
8816 gimple_set_bb (stmt,
8817 gimple_bb (new_stmt));
8818 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8819 gcc_assert (related_vinfo == NULL
8820 || related_vinfo == stmt_vinfo);
8825 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8826 using the original main loop and thus need to be updated to refer to the
8827 cloned variables used in the epilogue. */
8828 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8830 gimple *stmt = stmt_worklist[i];
8831 tree *new_op;
8833 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8835 tree op = gimple_op (stmt, j);
8836 if ((new_op = mapping.get(op)))
8837 gimple_set_op (stmt, j, *new_op);
8838 else
8840 /* PR92429: The last argument of simplify_replace_tree disables
8841 folding when replacing arguments. This is required as
8842 otherwise you might end up with different statements than the
8843 ones analyzed in vect_loop_analyze, leading to different
8844 vectorization. */
8845 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8846 &find_in_mapping, &mapping, false);
8847 gimple_set_op (stmt, j, op);
8852 struct data_reference *dr;
8853 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
8854 FOR_EACH_VEC_ELT (datarefs, i, dr)
8856 orig_stmt = DR_STMT (dr);
8857 gcc_assert (gimple_uid (orig_stmt) > 0);
8858 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8859 /* Data references for gather loads and scatter stores do not use the
8860 updated offset we set using ADVANCE. Instead we have to make sure the
8861 reference in the data references points to the corresponding copy of
8862 the original in the epilogue. */
8863 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8864 == VMAT_GATHER_SCATTER)
8866 DR_REF (dr)
8867 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8868 &find_in_mapping, &mapping);
8869 DR_BASE_ADDRESS (dr)
8870 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8871 &find_in_mapping, &mapping);
8873 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8874 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8875 /* The vector size of the epilogue is smaller than that of the main loop,
8876 so the alignment is either the same or lower. This means the dr will
8877 by definition be aligned. */
8878 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8881 epilogue_vinfo->shared->datarefs_copy.release ();
8882 epilogue_vinfo->shared->save_datarefs ();
8885 /* Function vect_transform_loop.
8887 The analysis phase has determined that the loop is vectorizable.
8888 Vectorize the loop - create vectorized stmts to replace the scalar
8889 stmts in the loop, and update the loop exit condition.
8890 Return the scalar epilogue loop, if any. */
8892 class loop *
8893 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8895 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8896 class loop *epilogue = NULL;
8897 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8898 int nbbs = loop->num_nodes;
8899 int i;
8900 tree niters_vector = NULL_TREE;
8901 tree step_vector = NULL_TREE;
8902 tree niters_vector_mult_vf = NULL_TREE;
8903 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8904 unsigned int lowest_vf = constant_lower_bound (vf);
8905 gimple *stmt;
8906 bool check_profitability = false;
8907 unsigned int th;
8909 DUMP_VECT_SCOPE ("vec_transform_loop");
8911 loop_vinfo->shared->check_datarefs ();
8913 /* Use the more conservative vectorization threshold. If the number
8914 of iterations is constant assume the cost check has been performed
8915 by our caller. If the threshold makes all loops profitable that
8916 run at least the (estimated) vectorization factor number of times
8917 checking is pointless, too. */
8918 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8919 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8921 if (dump_enabled_p ())
8922 dump_printf_loc (MSG_NOTE, vect_location,
8923 "Profitability threshold is %d loop iterations.\n",
8924 th);
8925 check_profitability = true;
8928 /* Make sure there exists a single-predecessor exit bb. Do this before
8929 versioning. */
8930 edge e = single_exit (loop);
8931 if (! single_pred_p (e->dest))
8933 split_loop_exit_edge (e, true);
8934 if (dump_enabled_p ())
8935 dump_printf (MSG_NOTE, "split exit edge\n");
8938 /* Version the loop first, if required, so the profitability check
8939 comes first. */
8941 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8943 class loop *sloop
8944 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8945 sloop->force_vectorize = false;
8946 check_profitability = false;
8949 /* Make sure there exists a single-predecessor exit bb also on the
8950 scalar loop copy. Do this after versioning but before peeling
8951 so the CFG structure is fine for both the scalar and if-converted loop
8952 to make slpeel_duplicate_current_defs_from_edges face matched
8953 loop closed PHI nodes on the exit. */
8954 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8956 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8957 if (! single_pred_p (e->dest))
8959 split_loop_exit_edge (e, true);
8960 if (dump_enabled_p ())
8961 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8965 tree niters = vect_build_loop_niters (loop_vinfo);
8966 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8967 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8968 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8969 tree advance;
8970 drs_init_vec orig_drs_init;
8972 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8973 &step_vector, &niters_vector_mult_vf, th,
8974 check_profitability, niters_no_overflow,
8975 &advance);
8977 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8978 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8979 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8980 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8982 if (niters_vector == NULL_TREE)
8984 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8985 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
8986 && known_eq (lowest_vf, vf))
8988 niters_vector
8989 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8990 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8991 step_vector = build_one_cst (TREE_TYPE (niters));
8993 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
8994 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8995 &step_vector, niters_no_overflow);
8996 else
8997 /* vect_do_peeling subtracted the number of peeled prologue
8998 iterations from LOOP_VINFO_NITERS. */
8999 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9000 &niters_vector, &step_vector,
9001 niters_no_overflow);
9004 /* 1) Make sure the loop header has exactly two entries
9005 2) Make sure we have a preheader basic block. */
9007 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9009 split_edge (loop_preheader_edge (loop));
9011 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9012 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
9013 /* This will deal with any possible peeling. */
9014 vect_prepare_for_masked_peels (loop_vinfo);
9016 /* Schedule the SLP instances first, then handle loop vectorization
9017 below. */
9018 if (!loop_vinfo->slp_instances.is_empty ())
9020 DUMP_VECT_SCOPE ("scheduling SLP instances");
9021 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9024 /* FORNOW: the vectorizer supports only loops whose body consists
9025 of one basic block (header + empty latch). When the vectorizer
9026 supports more involved loop forms, the order in which the BBs are
9027 traversed needs to be reconsidered. */
9029 for (i = 0; i < nbbs; i++)
9031 basic_block bb = bbs[i];
9032 stmt_vec_info stmt_info;
9034 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9035 gsi_next (&si))
9037 gphi *phi = si.phi ();
9038 if (dump_enabled_p ())
9039 dump_printf_loc (MSG_NOTE, vect_location,
9040 "------>vectorizing phi: %G", phi);
9041 stmt_info = loop_vinfo->lookup_stmt (phi);
9042 if (!stmt_info)
9043 continue;
9045 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9046 vect_loop_kill_debug_uses (loop, stmt_info);
9048 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9049 && !STMT_VINFO_LIVE_P (stmt_info))
9050 continue;
9052 if (STMT_VINFO_VECTYPE (stmt_info)
9053 && (maybe_ne
9054 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9055 && dump_enabled_p ())
9056 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9058 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9059 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9060 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9061 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9062 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9063 && ! PURE_SLP_STMT (stmt_info))
9065 if (dump_enabled_p ())
9066 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9067 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9071 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9072 gsi_next (&si))
9074 gphi *phi = si.phi ();
9075 stmt_info = loop_vinfo->lookup_stmt (phi);
9076 if (!stmt_info)
9077 continue;
9079 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9080 && !STMT_VINFO_LIVE_P (stmt_info))
9081 continue;
9083 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9084 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9085 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9086 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9087 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9088 && ! PURE_SLP_STMT (stmt_info))
9089 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9092 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9093 !gsi_end_p (si);)
9095 stmt = gsi_stmt (si);
9096 /* During vectorization remove existing clobber stmts. */
9097 if (gimple_clobber_p (stmt))
9099 unlink_stmt_vdef (stmt);
9100 gsi_remove (&si, true);
9101 release_defs (stmt);
9103 else
9105 /* Ignore vector stmts created in the outer loop. */
9106 stmt_info = loop_vinfo->lookup_stmt (stmt);
9108 /* vector stmts created in the outer-loop during vectorization of
9109 stmts in an inner-loop may not have a stmt_info, and do not
9110 need to be vectorized. */
9111 stmt_vec_info seen_store = NULL;
9112 if (stmt_info)
9114 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9116 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9117 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9118 !gsi_end_p (subsi); gsi_next (&subsi))
9120 stmt_vec_info pat_stmt_info
9121 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9122 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9123 &si, &seen_store);
9125 stmt_vec_info pat_stmt_info
9126 = STMT_VINFO_RELATED_STMT (stmt_info);
9127 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
9128 &seen_store);
9129 maybe_set_vectorized_backedge_value (loop_vinfo,
9130 pat_stmt_info);
9132 else
9134 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9135 &seen_store);
9136 maybe_set_vectorized_backedge_value (loop_vinfo,
9137 stmt_info);
9140 gsi_next (&si);
9141 if (seen_store)
9143 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9144 /* Interleaving: the vectorization of the
9145 interleaving chain was completed - free all
9146 the stores in the chain. */
9147 vect_remove_stores (loop_vinfo,
9148 DR_GROUP_FIRST_ELEMENT (seen_store));
9149 else
9150 /* Free the attached stmt_vec_info and remove the stmt. */
9151 loop_vinfo->remove_stmt (stmt_info);
9156 /* Stub out scalar statements that must not survive vectorization.
9157 Doing this here helps with grouped statements, or statements that
9158 are involved in patterns. */
9159 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9160 !gsi_end_p (gsi); gsi_next (&gsi))
9162 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9163 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9165 tree lhs = gimple_get_lhs (call);
9166 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9168 tree zero = build_zero_cst (TREE_TYPE (lhs));
9169 gimple *new_stmt = gimple_build_assign (lhs, zero);
9170 gsi_replace (&gsi, new_stmt, true);
9174 } /* BBs in loop */
9176 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9177 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9178 if (integer_onep (step_vector))
9179 niters_no_overflow = true;
9180 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9181 niters_vector_mult_vf, !niters_no_overflow);
9183 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9184 scale_profile_for_vect_loop (loop, assumed_vf);
9186 /* True if the final iteration might not handle a full vector's
9187 worth of scalar iterations. */
9188 bool final_iter_may_be_partial
9189 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9190 /* The minimum number of iterations performed by the epilogue. This
9191 is 1 when peeling for gaps because we always need a final scalar
9192 iteration. */
9193 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9194 /* +1 to convert latch counts to loop iteration counts,
9195 -min_epilogue_iters to remove iterations that cannot be performed
9196 by the vector code. */
9197 int bias_for_lowest = 1 - min_epilogue_iters;
9198 int bias_for_assumed = bias_for_lowest;
9199 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9200 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9202 /* When the amount of peeling is known at compile time, the first
9203 iteration will have exactly alignment_npeels active elements.
9204 In the worst case it will have at least one. */
9205 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9206 bias_for_lowest += lowest_vf - min_first_active;
9207 bias_for_assumed += assumed_vf - min_first_active;
9209 /* In these calculations the "- 1" converts loop iteration counts
9210 back to latch counts. */
9211 if (loop->any_upper_bound)
9212 loop->nb_iterations_upper_bound
9213 = (final_iter_may_be_partial
9214 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9215 lowest_vf) - 1
9216 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9217 lowest_vf) - 1);
9218 if (loop->any_likely_upper_bound)
9219 loop->nb_iterations_likely_upper_bound
9220 = (final_iter_may_be_partial
9221 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9222 + bias_for_lowest, lowest_vf) - 1
9223 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9224 + bias_for_lowest, lowest_vf) - 1);
9225 if (loop->any_estimate)
9226 loop->nb_iterations_estimate
9227 = (final_iter_may_be_partial
9228 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9229 assumed_vf) - 1
9230 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9231 assumed_vf) - 1);
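/* Worked example with illustrative numbers: with lowest_vf == 4, no
   peeling for gaps and no partial vectors, bias_for_lowest == 1, so a
   scalar latch bound of 11 (12 iterations) becomes
   (11 + 1) / 4 - 1 == 2 latch iterations of the vector loop.  */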
9233 if (dump_enabled_p ())
9235 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9237 dump_printf_loc (MSG_NOTE, vect_location,
9238 "LOOP VECTORIZED\n");
9239 if (loop->inner)
9240 dump_printf_loc (MSG_NOTE, vect_location,
9241 "OUTER LOOP VECTORIZED\n");
9242 dump_printf (MSG_NOTE, "\n");
9244 else
9245 dump_printf_loc (MSG_NOTE, vect_location,
9246 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9247 GET_MODE_NAME (loop_vinfo->vector_mode));
9250 /* Loops vectorized with a variable factor won't benefit from
9251 unrolling/peeling. */
9252 if (!vf.is_constant ())
9254 loop->unroll = 1;
9255 if (dump_enabled_p ())
9256 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9257 " variable-length vectorization factor\n");
9259 /* Free SLP instances here because otherwise stmt reference counting
9260 won't work. */
9261 slp_instance instance;
9262 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9263 vect_free_slp_instance (instance, true);
9264 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9265 /* Clear the safelen field since its value is invalid after vectorization:
9266 the vectorized loop can have loop-carried dependencies. */
9267 loop->safelen = 0;
9269 if (epilogue)
9271 update_epilogue_loop_vinfo (epilogue, advance);
9273 epilogue->simduid = loop->simduid;
9274 epilogue->force_vectorize = loop->force_vectorize;
9275 epilogue->dont_vectorize = false;
9278 return epilogue;
9281 /* The code below is trying to perform a simple optimization - revert
9282 if-conversion for masked stores, i.e. if the mask of a store is zero,
9283 do not perform the store and, if possible, skip the stored value producers too.
9284 For example,
9285 for (i=0; i<n; i++)
9286 if (c[i])
9288 p1[i] += 1;
9289 p2[i] = p3[i] +2;
9291 this transformation will produce the following semi-hammock:
9293 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9295 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9296 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9297 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9298 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9299 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9300 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9304 void
9305 optimize_mask_stores (class loop *loop)
9307 basic_block *bbs = get_loop_body (loop);
9308 unsigned nbbs = loop->num_nodes;
9309 unsigned i;
9310 basic_block bb;
9311 class loop *bb_loop;
9312 gimple_stmt_iterator gsi;
9313 gimple *stmt;
9314 auto_vec<gimple *> worklist;
9315 auto_purge_vect_location sentinel;
9317 vect_location = find_loop_location (loop);
9318 /* Pick up all masked stores in loop if any. */
9319 for (i = 0; i < nbbs; i++)
9321 bb = bbs[i];
9322 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9323 gsi_next (&gsi))
9325 stmt = gsi_stmt (gsi);
9326 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9327 worklist.safe_push (stmt);
9331 free (bbs);
9332 if (worklist.is_empty ())
9333 return;
9335 /* Loop has masked stores. */
9336 while (!worklist.is_empty ())
9338 gimple *last, *last_store;
9339 edge e, efalse;
9340 tree mask;
9341 basic_block store_bb, join_bb;
9342 gimple_stmt_iterator gsi_to;
9343 tree vdef, new_vdef;
9344 gphi *phi;
9345 tree vectype;
9346 tree zero;
9348 last = worklist.pop ();
9349 mask = gimple_call_arg (last, 2);
9350 bb = gimple_bb (last);
9351 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9352 to the same loop as if_bb. It could be different from LOOP when a
9353 two-level loop nest is vectorized and the mask_store belongs to the
9354 inner one. */
9355 e = split_block (bb, last);
9356 bb_loop = bb->loop_father;
9357 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9358 join_bb = e->dest;
9359 store_bb = create_empty_bb (bb);
9360 add_bb_to_loop (store_bb, bb_loop);
9361 e->flags = EDGE_TRUE_VALUE;
9362 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9363 /* Put STORE_BB to likely part. */
9364 efalse->probability = profile_probability::unlikely ();
9365 store_bb->count = efalse->count ();
9366 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9367 if (dom_info_available_p (CDI_DOMINATORS))
9368 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9369 if (dump_enabled_p ())
9370 dump_printf_loc (MSG_NOTE, vect_location,
9371 "Create new block %d to sink mask stores.",
9372 store_bb->index);
9373 /* Create vector comparison with boolean result. */
9374 vectype = TREE_TYPE (mask);
9375 zero = build_zero_cst (vectype);
9376 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9377 gsi = gsi_last_bb (bb);
9378 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9379 /* Create new PHI node for vdef of the last masked store:
9380 .MEM_2 = VDEF <.MEM_1>
9381 will be converted to
9382 .MEM.3 = VDEF <.MEM_1>
9383 and new PHI node will be created in join bb
9384 .MEM_2 = PHI <.MEM_1, .MEM_3>
9386 vdef = gimple_vdef (last);
9387 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9388 gimple_set_vdef (last, new_vdef);
9389 phi = create_phi_node (vdef, join_bb);
9390 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9392 /* Put all masked stores with the same mask to STORE_BB if possible. */
9393 while (true)
9395 gimple_stmt_iterator gsi_from;
9396 gimple *stmt1 = NULL;
9398 /* Move masked store to STORE_BB. */
9399 last_store = last;
9400 gsi = gsi_for_stmt (last);
9401 gsi_from = gsi;
9402 /* Shift GSI to the previous stmt for further traversal. */
9403 gsi_prev (&gsi);
9404 gsi_to = gsi_start_bb (store_bb);
9405 gsi_move_before (&gsi_from, &gsi_to);
9406 /* Set up GSI_TO to the non-empty block start. */
9407 gsi_to = gsi_start_bb (store_bb);
9408 if (dump_enabled_p ())
9409 dump_printf_loc (MSG_NOTE, vect_location,
9410 "Move stmt to created bb\n%G", last);
9411 /* Move all stored value producers if possible. */
9412 while (!gsi_end_p (gsi))
9414 tree lhs;
9415 imm_use_iterator imm_iter;
9416 use_operand_p use_p;
9417 bool res;
9419 /* Skip debug statements. */
9420 if (is_gimple_debug (gsi_stmt (gsi)))
9422 gsi_prev (&gsi);
9423 continue;
9425 stmt1 = gsi_stmt (gsi);
9426 /* Do not consider statements writing to memory or having a
9427 volatile operand. */
9428 if (gimple_vdef (stmt1)
9429 || gimple_has_volatile_ops (stmt1))
9430 break;
9431 gsi_from = gsi;
9432 gsi_prev (&gsi);
9433 lhs = gimple_get_lhs (stmt1);
9434 if (!lhs)
9435 break;
9437 /* LHS of vectorized stmt must be SSA_NAME. */
9438 if (TREE_CODE (lhs) != SSA_NAME)
9439 break;
9441 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9443 /* Remove dead scalar statement. */
9444 if (has_zero_uses (lhs))
9446 gsi_remove (&gsi_from, true);
9447 continue;
9451 /* Check that LHS does not have uses outside of STORE_BB. */
9452 res = true;
9453 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9455 gimple *use_stmt;
9456 use_stmt = USE_STMT (use_p);
9457 if (is_gimple_debug (use_stmt))
9458 continue;
9459 if (gimple_bb (use_stmt) != store_bb)
9461 res = false;
9462 break;
9465 if (!res)
9466 break;
9468 if (gimple_vuse (stmt1)
9469 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9470 break;
9472 /* Can move STMT1 to STORE_BB. */
9473 if (dump_enabled_p ())
9474 dump_printf_loc (MSG_NOTE, vect_location,
9475 "Move stmt to created bb\n%G", stmt1);
9476 gsi_move_before (&gsi_from, &gsi_to);
9477 /* Shift GSI_TO for further insertion. */
9478 gsi_prev (&gsi_to);
9480 /* Put other masked stores with the same mask to STORE_BB. */
9481 if (worklist.is_empty ()
9482 || gimple_call_arg (worklist.last (), 2) != mask
9483 || worklist.last () != stmt1)
9484 break;
9485 last = worklist.pop ();
9487 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9491 /* Decide whether it is possible to use a zero-based induction variable
9492 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9493 the value that the induction variable must be able to hold in order
9494 to ensure that the rgroups eventually have no active vector elements.
9495 Return -1 otherwise. */
9497 widest_int
9498 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9500 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9501 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9502 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9504 /* Calculate the value that the induction variable must be able
9505 to hit in order to ensure that we end the loop with an all-false mask.
9506 This involves adding the maximum number of inactive trailing scalar
9507 iterations. */
9508 widest_int iv_limit = -1;
9509 if (max_loop_iterations (loop, &iv_limit))
9511 if (niters_skip)
9513 /* Add the maximum number of skipped iterations to the
9514 maximum iteration count. */
9515 if (TREE_CODE (niters_skip) == INTEGER_CST)
9516 iv_limit += wi::to_widest (niters_skip);
9517 else
9518 iv_limit += max_vf - 1;
9520 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9521 /* Make a conservatively-correct assumption. */
9522 iv_limit += max_vf - 1;
9524 /* IV_LIMIT is the maximum number of latch iterations, which is also
9525 the maximum in-range IV value. Round this value down to the previous
9526 vector alignment boundary and then add an extra full iteration. */
9527 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9528 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9530 return iv_limit;
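/* Illustrative example: with a maximum latch count of 1000, no skipped
   iterations and a constant VF == max_vf == 16, iv_limit is rounded
   down to 992 and then 16 is added, giving 1008 as the value the IV
   must be able to hold.  */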
9533 /* For the given rgroup_controls RGC, check whether an induction variable
9534 would ever hit a value that produces a set of all-false masks or zero
9535 lengths before wrapping around. Return true if it's possible to wrap
9536 around before hitting the desirable value, otherwise return false. */
9538 bool
9539 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9541 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9543 if (iv_limit == -1)
9544 return true;
9546 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9547 unsigned int compare_precision = TYPE_PRECISION (compare_type);
9548 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
9550 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9551 return true;
9553 return false;
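/* Illustrative example: with iv_limit == 1008 and an rgroup needing one
   scalar per iteration with factor 1, nitems == 1 and
   wi::min_precision (1008, UNSIGNED) == 10 bits, which fits in a 32-bit
   compare type, so no wrapping is possible and we return false.  */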