1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
69 as if it were manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
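/* A minimal sketch of the optab query described above; the names used
   (add_optab, V8HImode, optab_handler, CODE_FOR_nothing) are the ones
   already mentioned in this comment, and the snippet is illustrative
   only, not code used by the pass:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;

   i.e. if there is no insn for a V8HI addition, the addition in the
   example loop above is not vectorizable for this target.  */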
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
186 if (stmt_vectype)
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return opt_result::success ();
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
250 return opt_result::success ();
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4-byte elements,
258 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
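/* A concrete instance of the strip-mining above (a sketch with assumed
   numbers): with 16-byte vectors and 4-byte 'int' elements the VF is 4,
   so

     int a[1000], b[1000], c[1000];
     for (i = 0; i < 1000; i++)
       a[i] = b[i] + c[i];

   becomes 250 vector iterations:

     for (i = 0; i < 1000; i += 4)
       a[i:4] = b[i:4] + c[i:4];  */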
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 for (i = 0; i < nbbs; i++)
295 basic_block bb = bbs[i];
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
306 gcc_assert (stmt_info);
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
338 vect_update_max_nunits (&vectorization_factor, vectype);
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
372 /* Function vect_is_simple_iv_evolution.
374 FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
403 *init = init_expr;
404 *step = step_expr;
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
422 return true;
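/* For illustration (a sketch using the usual scev chrec notation and an
   assumed loop number of 1): in

     for (i = 0; i < n; i++)
       sum = sum + a[i];

   the access function of 'i' is {0, +, 1}_1, a simple evolution with
   init 0 and step 1.  An evolution whose step is itself a chrec, for
   example {0, +, {1, +, 1}_1}_1, has degree >= 2 and is rejected by the
   tree_is_chrec check above.  */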
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
436 x_3 = ...;
439 outer2:
440 x_4 = PHI <x_3(inner)>;
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
458 /* Function vect_analyze_scalar_cycles_1.
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified; therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
516 worklist.safe_push (stmt_vinfo);
517 continue;
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
559 else
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
569 else
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
594 /* Function vect_analyze_scalar_cycles.
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also for its
600 inner loop, if it exists.
601 Examples for scalar cycles:
603 Example1: reduction:
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
609 Example2: induction:
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
657 while (stmt_info);
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
665 stmt_vec_info first;
666 unsigned i;
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669 if (STMT_VINFO_IN_PATTERN_P (first))
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
674 if (! STMT_VINFO_IN_PATTERN_P (next)
675 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
679 /* If not all stmts in the chain are patterns, or if we failed
680 to update STMT_VINFO_REDUC_IDX, try to handle the chain
681 without patterns. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
685 vect_fixup_reduc_chain (first);
686 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
687 = STMT_VINFO_RELATED_STMT (first);
692 /* Function vect_get_loop_niters.
694 Determine how many iterations the loop is executed and place it
695 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
696 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
697 niter information holds in ASSUMPTIONS.
699 Return the loop exit condition. */
702 static gcond *
703 vect_get_loop_niters (class loop *loop, tree *assumptions,
704 tree *number_of_iterations, tree *number_of_iterationsm1)
706 edge exit = single_exit (loop);
707 class tree_niter_desc niter_desc;
708 tree niter_assumptions, niter, may_be_zero;
709 gcond *cond = get_loop_exit_condition (loop);
711 *assumptions = boolean_true_node;
712 *number_of_iterationsm1 = chrec_dont_know;
713 *number_of_iterations = chrec_dont_know;
714 DUMP_VECT_SCOPE ("get_loop_niters");
716 if (!exit)
717 return cond;
719 may_be_zero = NULL_TREE;
720 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
721 || chrec_contains_undetermined (niter_desc.niter))
722 return cond;
724 niter_assumptions = niter_desc.assumptions;
725 may_be_zero = niter_desc.may_be_zero;
726 niter = niter_desc.niter;
728 if (may_be_zero && integer_zerop (may_be_zero))
729 may_be_zero = NULL_TREE;
731 if (may_be_zero)
733 if (COMPARISON_CLASS_P (may_be_zero))
735 /* Try to combine may_be_zero with assumptions; this can simplify
736 computation of niter expression. */
737 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
738 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
739 niter_assumptions,
740 fold_build1 (TRUTH_NOT_EXPR,
741 boolean_type_node,
742 may_be_zero));
743 else
744 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
745 build_int_cst (TREE_TYPE (niter), 0),
746 rewrite_to_non_trapping_overflow (niter));
748 may_be_zero = NULL_TREE;
750 else if (integer_nonzerop (may_be_zero))
752 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
753 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
754 return cond;
756 else
757 return cond;
760 *assumptions = niter_assumptions;
761 *number_of_iterationsm1 = niter;
763 /* We want the number of loop header executions which is the number
764 of latch executions plus one.
765 ??? For UINT_MAX latch executions this number overflows to zero
766 for loops like do { n++; } while (n != 0); */
767 if (niter && !chrec_contains_undetermined (niter))
768 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
769 build_int_cst (TREE_TYPE (niter), 1));
770 *number_of_iterations = niter;
772 return cond;
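/* Worked example (a sketch with assumed values): for

     i = 0;
     do { a[i] = 0; i++; } while (i < n);

   with n known to be 100, the latch runs 99 times, so
   *NUMBER_OF_ITERATIONSM1 is 99 while *NUMBER_OF_ITERATIONS, the number
   of header executions, is 100.  As the comment above notes, for
   UINT_MAX latch executions the final + 1 wraps to zero.  */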
775 /* Function bb_in_loop_p
777 Used as predicate for dfs order traversal of the loop bbs. */
779 static bool
780 bb_in_loop_p (const_basic_block bb, const void *data)
782 const class loop *const loop = (const class loop *)data;
783 if (flow_bb_inside_loop_p (loop, bb))
784 return true;
785 return false;
789 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
790 stmt_vec_info structs for all the stmts in LOOP_IN. */
792 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
793 : vec_info (vec_info::loop, init_cost (loop_in), shared),
794 loop (loop_in),
795 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
796 num_itersm1 (NULL_TREE),
797 num_iters (NULL_TREE),
798 num_iters_unchanged (NULL_TREE),
799 num_iters_assumptions (NULL_TREE),
800 th (0),
801 versioning_threshold (0),
802 vectorization_factor (0),
803 max_vectorization_factor (0),
804 mask_skip_niters (NULL_TREE),
805 rgroup_compare_type (NULL_TREE),
806 simd_if_cond (NULL_TREE),
807 unaligned_dr (NULL),
808 peeling_for_alignment (0),
809 ptr_mask (0),
810 ivexpr_map (NULL),
811 scan_map (NULL),
812 slp_unrolling_factor (1),
813 single_scalar_iteration_cost (0),
814 vec_outside_cost (0),
815 vec_inside_cost (0),
816 vectorizable (false),
817 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
818 using_partial_vectors_p (false),
819 epil_using_partial_vectors_p (false),
820 peeling_for_gaps (false),
821 peeling_for_niter (false),
822 no_data_dependencies (false),
823 has_mask_store (false),
824 scalar_loop_scaling (profile_probability::uninitialized ()),
825 scalar_loop (NULL),
826 orig_loop_info (NULL)
828 /* CHECKME: We want to visit all BBs before their successors (except for
829 latch blocks, for which this assertion wouldn't hold). In the simple
830 case of the loop forms we allow, a dfs order of the BBs would be the same
831 as reversed postorder traversal, so we are safe. */
833 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
834 bbs, loop->num_nodes, loop);
835 gcc_assert (nbbs == loop->num_nodes);
837 for (unsigned int i = 0; i < nbbs; i++)
839 basic_block bb = bbs[i];
840 gimple_stmt_iterator si;
842 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
844 gimple *phi = gsi_stmt (si);
845 gimple_set_uid (phi, 0);
846 add_stmt (phi);
849 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
851 gimple *stmt = gsi_stmt (si);
852 gimple_set_uid (stmt, 0);
853 if (is_gimple_debug (stmt))
854 continue;
855 add_stmt (stmt);
856 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
857 third argument is the #pragma omp simd if (x) condition. When it is 0,
858 the loop shouldn't be vectorized; when it is a non-zero constant, the
859 loop should be vectorized normally; otherwise the loop is versioned, with
860 the vectorized copy used if the condition is non-zero at runtime. */
861 if (loop_in->simduid
862 && is_gimple_call (stmt)
863 && gimple_call_internal_p (stmt)
864 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
865 && gimple_call_num_args (stmt) >= 3
866 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
867 && (loop_in->simduid
868 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
870 tree arg = gimple_call_arg (stmt, 2);
871 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
872 simd_if_cond = arg;
873 else
874 gcc_assert (integer_nonzerop (arg));
879 epilogue_vinfos.create (6);
882 /* Free all levels of rgroup CONTROLS. */
884 void
885 release_vec_loop_controls (vec<rgroup_controls> *controls)
887 rgroup_controls *rgc;
888 unsigned int i;
889 FOR_EACH_VEC_ELT (*controls, i, rgc)
890 rgc->controls.release ();
891 controls->release ();
894 /* Free all memory used by the _loop_vec_info, as well as all the
895 stmt_vec_info structs of all the stmts in the loop. */
897 _loop_vec_info::~_loop_vec_info ()
899 free (bbs);
901 release_vec_loop_controls (&masks);
902 release_vec_loop_controls (&lens);
903 delete ivexpr_map;
904 delete scan_map;
905 epilogue_vinfos.release ();
907 loop->aux = NULL;
910 /* Return an invariant or register for EXPR and emit necessary
911 computations in the LOOP_VINFO loop preheader. */
913 tree
914 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
916 if (is_gimple_reg (expr)
917 || is_gimple_min_invariant (expr))
918 return expr;
920 if (! loop_vinfo->ivexpr_map)
921 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
922 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
923 if (! cached)
925 gimple_seq stmts = NULL;
926 cached = force_gimple_operand (unshare_expr (expr),
927 &stmts, true, NULL_TREE);
928 if (stmts)
930 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
931 gsi_insert_seq_on_edge_immediate (e, stmts);
934 return cached;
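/* Usage sketch (hypothetical caller and expression, for illustration
   only): a caller that needs an invariant such as 'start + step * vf'
   available as a register in the loop could do

     tree val = cse_and_gimplify_to_preheader (loop_vinfo, expr);

   Any statements needed to compute EXPR are inserted on the preheader
   edge, and a later call with an equal expression returns the same
   cached SSA name instead of emitting the computation again.  */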
937 /* Return true if we can use CMP_TYPE as the comparison type to produce
938 all masks required to mask LOOP_VINFO. */
940 static bool
941 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
943 rgroup_controls *rgm;
944 unsigned int i;
945 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
946 if (rgm->type != NULL_TREE
947 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
948 cmp_type, rgm->type,
949 OPTIMIZE_FOR_SPEED))
950 return false;
951 return true;
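/* Illustration of the check above (a sketch with assumed types): with
   CMP_TYPE a 64-bit unsigned type and an rgroup whose mask type has 8
   boolean lanes, the target must support IFN_WHILE_ULT for that pair,
   i.e. an operation that computes a mask whose lane L is set iff
   INDEX + L < LIMIT for scalar operands of CMP_TYPE.  If any rgroup
   lacks such support, this function returns false and full masking is
   not used.  */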
954 /* Calculate the maximum number of scalars per iteration for every
955 rgroup in LOOP_VINFO. */
957 static unsigned int
958 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
960 unsigned int res = 1;
961 unsigned int i;
962 rgroup_controls *rgm;
963 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
964 res = MAX (res, rgm->max_nscalars_per_iter);
965 return res;
968 /* Calculate the minimum precision necessary to represent:
970 MAX_NITERS * FACTOR
972 as an unsigned integer, where MAX_NITERS is the maximum number of
973 loop header iterations for the original scalar form of LOOP_VINFO. */
975 static unsigned
976 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
978 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
980 /* Get the maximum number of iterations that is representable
981 in the counter type. */
982 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
983 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
985 /* Get a more refined estimate for the number of iterations. */
986 widest_int max_back_edges;
987 if (max_loop_iterations (loop, &max_back_edges))
988 max_ni = wi::smin (max_ni, max_back_edges + 1);
990 /* Work out how many bits we need to represent the limit. */
991 return wi::min_precision (max_ni * factor, UNSIGNED);
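/* Worked example (assumed numbers): if the niter type is 32 bits wide
   but max_loop_iterations bounds the number of back edges by 999, then
   MAX_NI is refined to 1000; with FACTOR 2 the limit is 2000, and
   wi::min_precision (2000, UNSIGNED) is 11, so 11 bits suffice.  */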
994 /* True if the loop needs peeling or partial vectors when vectorized. */
996 static bool
997 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
999 unsigned HOST_WIDE_INT const_vf;
1000 HOST_WIDE_INT max_niter
1001 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1003 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1004 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1005 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1006 (loop_vinfo));
1008 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1009 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1011 /* Work out the (constant) number of iterations that need to be
1012 peeled for reasons other than niters. */
1013 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1014 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1015 peel_niter += 1;
1016 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1017 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1018 return true;
1020 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1021 /* ??? When peeling for gaps but not alignment, we could
1022 try to check whether the (variable) niters is known to be
1023 VF * N + 1. That's something of a niche case though. */
1024 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1025 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1026 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1027 < (unsigned) exact_log2 (const_vf))
1028 /* In case of versioning, check if the maximum number of
1029 iterations is greater than th. If they are identical,
1030 the epilogue is unnecessary. */
1031 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1032 || ((unsigned HOST_WIDE_INT) max_niter
1033 > (th / const_vf) * const_vf))))
1034 return true;
1036 return false;
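/* Worked example (assumed numbers): with a known iteration count of 10,
   no peeling for alignment or gaps, and a vectorization factor of 4,
   the multiple_p check above fails (10 is not a multiple of 4), so
   peeling or partial vectors are needed for the 2 leftover iterations;
   with a count of 12 the function returns false.  */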
1039 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1040 whether we can actually generate the masks required. Return true if so,
1041 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1043 static bool
1044 vect_verify_full_masking (loop_vec_info loop_vinfo)
1046 unsigned int min_ni_width;
1047 unsigned int max_nscalars_per_iter
1048 = vect_get_max_nscalars_per_iter (loop_vinfo);
1050 /* Use a normal loop if there are no statements that need masking.
1051 This only happens in rare degenerate cases: it means that the loop
1052 has no loads, no stores, and no live-out values. */
1053 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1054 return false;
1056 /* Work out how many bits we need to represent the limit. */
1057 min_ni_width
1058 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1060 /* Find a scalar mode for which WHILE_ULT is supported. */
1061 opt_scalar_int_mode cmp_mode_iter;
1062 tree cmp_type = NULL_TREE;
1063 tree iv_type = NULL_TREE;
1064 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1065 unsigned int iv_precision = UINT_MAX;
1067 if (iv_limit != -1)
1068 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1069 UNSIGNED);
1071 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1073 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1074 if (cmp_bits >= min_ni_width
1075 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1077 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1078 if (this_type
1079 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1081 /* Although we could stop as soon as we find a valid mode,
1082 there are at least two reasons why that's not always the
1083 best choice:
1085 - An IV that's Pmode or wider is more likely to be reusable
1086 in address calculations than an IV that's narrower than
1087 Pmode.
1089 - Doing the comparison in IV_PRECISION or wider allows
1090 a natural 0-based IV, whereas using a narrower comparison
1091 type requires mitigations against wrap-around.
1093 Conversely, if the IV limit is variable, doing the comparison
1094 in a wider type than the original type can introduce
1095 unnecessary extensions, so picking the widest valid mode
1096 is not always a good choice either.
1098 Here we prefer the first IV type that's Pmode or wider,
1099 and the first comparison type that's IV_PRECISION or wider.
1100 (The comparison type must be no wider than the IV type,
1101 to avoid extensions in the vector loop.)
1103 ??? We might want to try continuing beyond Pmode for ILP32
1104 targets if CMP_BITS < IV_PRECISION. */
1105 iv_type = this_type;
1106 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1107 cmp_type = this_type;
1108 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1109 break;
1114 if (!cmp_type)
1115 return false;
1117 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1118 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1119 return true;
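/* Example outcome (a sketch with assumed target properties): on a
   target with 64-bit Pmode where the required masks can be produced for
   16-, 32- and 64-bit comparison types and IV_PRECISION is 32, the loop
   above ends with CMP_TYPE a 32-bit unsigned type (the first one at
   least IV_PRECISION wide) and IV_TYPE a 64-bit unsigned type (the
   first one at least Pmode wide).  */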
1122 /* Check whether we can use vector access with length based on precision
1123 comparison. So far, to keep it simple, we only allow the case that the
1124 precision of the target supported length is larger than the precision
1125 required by loop niters. */
1127 static bool
1128 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1130 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1131 return false;
1133 unsigned int max_nitems_per_iter = 1;
1134 unsigned int i;
1135 rgroup_controls *rgl;
1136 /* Find the maximum number of items per iteration for every rgroup. */
1137 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1139 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1140 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1143 /* Work out how many bits we need to represent the length limit. */
1144 unsigned int min_ni_prec
1145 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1147 /* Now use the maximum of the precisions below for one suitable IV type:
1148 - the IV's natural precision
1149 - the precision needed to hold: the maximum number of scalar
1150 iterations multiplied by the scale factor (min_ni_prec above)
1151 - the Pmode precision
1153 If min_ni_prec is less than the precision of the current niters,
1154 we prefer to still use the niters type. Prefer to use a Pmode or
1155 wider IV to avoid narrow conversions. */
1157 unsigned int ni_prec
1158 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1159 min_ni_prec = MAX (min_ni_prec, ni_prec);
1160 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1162 tree iv_type = NULL_TREE;
1163 opt_scalar_int_mode tmode_iter;
1164 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1166 scalar_mode tmode = tmode_iter.require ();
1167 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1169 /* ??? Do we really want to construct one IV whose precision exceeds
1170 BITS_PER_WORD? */
1171 if (tbits > BITS_PER_WORD)
1172 break;
1174 /* Find the first available standard integral type. */
1175 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1177 iv_type = build_nonstandard_integer_type (tbits, true);
1178 break;
1182 if (!iv_type)
1184 if (dump_enabled_p ())
1185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186 "can't vectorize with length-based partial vectors"
1187 " because there is no suitable iv type.\n");
1188 return false;
1191 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1192 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1194 return true;
1197 /* Calculate the cost of one scalar iteration of the loop. */
1198 static void
1199 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1201 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1202 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1203 int nbbs = loop->num_nodes, factor;
1204 int innerloop_iters, i;
1206 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1208 /* Gather costs for statements in the scalar loop. */
1210 /* FORNOW. */
1211 innerloop_iters = 1;
1212 if (loop->inner)
1213 innerloop_iters = 50; /* FIXME */
1215 for (i = 0; i < nbbs; i++)
1217 gimple_stmt_iterator si;
1218 basic_block bb = bbs[i];
1220 if (bb->loop_father == loop->inner)
1221 factor = innerloop_iters;
1222 else
1223 factor = 1;
1225 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1227 gimple *stmt = gsi_stmt (si);
1228 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1230 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1231 continue;
1233 /* Skip stmts that are not vectorized inside the loop. */
1234 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1235 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1236 && (!STMT_VINFO_LIVE_P (vstmt_info)
1237 || !VECTORIZABLE_CYCLE_DEF
1238 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1239 continue;
1241 vect_cost_for_stmt kind;
1242 if (STMT_VINFO_DATA_REF (stmt_info))
1244 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1245 kind = scalar_load;
1246 else
1247 kind = scalar_store;
1249 else if (vect_nop_conversion_p (stmt_info))
1250 continue;
1251 else
1252 kind = scalar_stmt;
1254 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1255 factor, kind, stmt_info, 0, vect_prologue);
1259 /* Now accumulate cost. */
1260 void *target_cost_data = init_cost (loop);
1261 stmt_info_for_cost *si;
1262 int j;
1263 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1264 j, si)
1265 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1266 si->kind, si->stmt_info, si->vectype,
1267 si->misalign, vect_body);
1268 unsigned dummy, body_cost = 0;
1269 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1270 destroy_cost_data (target_cost_data);
1271 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
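/* Illustration (a sketch with an assumed loop body): for a scalar body
   consisting of a load from b[i], an add, and a store to a[i], the loop
   above records one scalar_load, one scalar_stmt and one scalar_store,
   each with factor 1; the target cost hooks then turn those counts into
   the single scalar iteration cost used by the profitability checks
   later on.  */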
1275 /* Function vect_analyze_loop_form_1.
1277 Verify that certain CFG restrictions hold, including:
1278 - the loop has a pre-header
1279 - the loop has a single entry and exit
1280 - the loop exit condition is simple enough
1281 - the number of iterations can be analyzed, i.e., a countable loop. The
1282 niter could be analyzed under some assumptions. */
1284 opt_result
1285 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1286 tree *assumptions, tree *number_of_iterationsm1,
1287 tree *number_of_iterations, gcond **inner_loop_cond)
1289 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1291 /* Different restrictions apply when we are considering an inner-most loop,
1292 vs. an outer (nested) loop.
1293 (FORNOW. May want to relax some of these restrictions in the future). */
1295 if (!loop->inner)
1297 /* Inner-most loop. We currently require that the number of BBs is
1298 exactly 2 (the header and latch). Vectorizable inner-most loops
1299 look like this:
1301 (pre-header)
1303 header <--------+
1304 | | |
1305 | +--> latch --+
1307 (exit-bb) */
1309 if (loop->num_nodes != 2)
1310 return opt_result::failure_at (vect_location,
1311 "not vectorized:"
1312 " control flow in loop.\n");
1314 if (empty_block_p (loop->header))
1315 return opt_result::failure_at (vect_location,
1316 "not vectorized: empty loop.\n");
1318 else
1320 class loop *innerloop = loop->inner;
1321 edge entryedge;
1323 /* Nested loop. We currently require that the loop is doubly-nested,
1324 contains a single inner loop, and the number of BBs is exactly 5.
1325 Vectorizable outer-loops look like this:
1327 (pre-header)
1329 header <---+
1331 inner-loop |
1333 tail ------+
1335 (exit-bb)
1337 The inner-loop has the properties expected of inner-most loops
1338 as described above. */
1340 if ((loop->inner)->inner || (loop->inner)->next)
1341 return opt_result::failure_at (vect_location,
1342 "not vectorized:"
1343 " multiple nested loops.\n");
1345 if (loop->num_nodes != 5)
1346 return opt_result::failure_at (vect_location,
1347 "not vectorized:"
1348 " control flow in loop.\n");
1350 entryedge = loop_preheader_edge (innerloop);
1351 if (entryedge->src != loop->header
1352 || !single_exit (innerloop)
1353 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1354 return opt_result::failure_at (vect_location,
1355 "not vectorized:"
1356 " unsupported outerloop form.\n");
1358 /* Analyze the inner-loop. */
1359 tree inner_niterm1, inner_niter, inner_assumptions;
1360 opt_result res
1361 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1362 &inner_assumptions, &inner_niterm1,
1363 &inner_niter, NULL);
1364 if (!res)
1366 if (dump_enabled_p ())
1367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1368 "not vectorized: Bad inner loop.\n");
1369 return res;
1372 /* Don't support analyzing niter under assumptions for inner
1373 loop. */
1374 if (!integer_onep (inner_assumptions))
1375 return opt_result::failure_at (vect_location,
1376 "not vectorized: Bad inner loop.\n");
1378 if (!expr_invariant_in_loop_p (loop, inner_niter))
1379 return opt_result::failure_at (vect_location,
1380 "not vectorized: inner-loop count not"
1381 " invariant.\n");
1383 if (dump_enabled_p ())
1384 dump_printf_loc (MSG_NOTE, vect_location,
1385 "Considering outer-loop vectorization.\n");
1388 if (!single_exit (loop))
1389 return opt_result::failure_at (vect_location,
1390 "not vectorized: multiple exits.\n");
1391 if (EDGE_COUNT (loop->header->preds) != 2)
1392 return opt_result::failure_at (vect_location,
1393 "not vectorized:"
1394 " too many incoming edges.\n");
1396 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1397 that the loop is represented as a do-while (with a proper if-guard
1398 before the loop if needed), where the loop header contains all the
1399 executable statements, and the latch is empty. */
1400 if (!empty_block_p (loop->latch)
1401 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1402 return opt_result::failure_at (vect_location,
1403 "not vectorized: latch block not empty.\n");
1405 /* Make sure the exit is not abnormal. */
1406 edge e = single_exit (loop);
1407 if (e->flags & EDGE_ABNORMAL)
1408 return opt_result::failure_at (vect_location,
1409 "not vectorized:"
1410 " abnormal loop exit edge.\n");
1412 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1413 number_of_iterationsm1);
1414 if (!*loop_cond)
1415 return opt_result::failure_at
1416 (vect_location,
1417 "not vectorized: complicated exit condition.\n");
1419 if (integer_zerop (*assumptions)
1420 || !*number_of_iterations
1421 || chrec_contains_undetermined (*number_of_iterations))
1422 return opt_result::failure_at
1423 (*loop_cond,
1424 "not vectorized: number of iterations cannot be computed.\n");
1426 if (integer_zerop (*number_of_iterations))
1427 return opt_result::failure_at
1428 (*loop_cond,
1429 "not vectorized: number of iterations = 0.\n");
1431 return opt_result::success ();
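/* Example of a rejected form (a sketch): an inner-most loop with an
   early exit, e.g.

     for (i = 0; i < n; i++)
       {
         if (a[i] == key)
           break;
         b[i] = 0;
       }

   has more than two basic blocks and more than one exit, so it fails
   the checks above; similarly, a loop whose latch block contains
   statements fails the empty-latch check.  */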
1434 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1436 opt_loop_vec_info
1437 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1439 tree assumptions, number_of_iterations, number_of_iterationsm1;
1440 gcond *loop_cond, *inner_loop_cond = NULL;
1442 opt_result res
1443 = vect_analyze_loop_form_1 (loop, &loop_cond,
1444 &assumptions, &number_of_iterationsm1,
1445 &number_of_iterations, &inner_loop_cond);
1446 if (!res)
1447 return opt_loop_vec_info::propagate_failure (res);
1449 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1450 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1451 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1452 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1453 if (!integer_onep (assumptions))
1455 /* We consider vectorizing this loop by versioning it under
1456 some assumptions. In order to do this, we need to clear
1457 existing information computed by scev and niter analyzer. */
1458 scev_reset_htab ();
1459 free_numbers_of_iterations_estimates (loop);
1460 /* Also set flag for this loop so that following scev and niter
1461 analysis are done under the assumptions. */
1462 loop_constraint_set (loop, LOOP_C_FINITE);
1463 /* Also record the assumptions for versioning. */
1464 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1467 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1469 if (dump_enabled_p ())
1471 dump_printf_loc (MSG_NOTE, vect_location,
1472 "Symbolic number of iterations is ");
1473 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1474 dump_printf (MSG_NOTE, "\n");
1478 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1479 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1480 if (inner_loop_cond)
1482 stmt_vec_info inner_loop_cond_info
1483 = loop_vinfo->lookup_stmt (inner_loop_cond);
1484 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1487 gcc_assert (!loop->aux);
1488 loop->aux = loop_vinfo;
1489 return opt_loop_vec_info::success (loop_vinfo);
1494 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1495 statements update the vectorization factor. */
1497 static void
1498 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1500 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1501 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1502 int nbbs = loop->num_nodes;
1503 poly_uint64 vectorization_factor;
1504 int i;
1506 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1508 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1509 gcc_assert (known_ne (vectorization_factor, 0U));
1511 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1512 vectorization factor of the loop is the unrolling factor required by
1513 the SLP instances. If that unrolling factor is 1, we say that we
1514 perform pure SLP on the loop; cross-iteration parallelism is not
1515 exploited. */
1516 bool only_slp_in_loop = true;
1517 for (i = 0; i < nbbs; i++)
1519 basic_block bb = bbs[i];
1520 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1521 gsi_next (&si))
1523 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1524 if (!stmt_info)
1525 continue;
1526 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1527 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1528 && !PURE_SLP_STMT (stmt_info))
1529 /* STMT needs both SLP and loop-based vectorization. */
1530 only_slp_in_loop = false;
1532 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1533 gsi_next (&si))
1535 if (is_gimple_debug (gsi_stmt (si)))
1536 continue;
1537 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1538 stmt_info = vect_stmt_to_vectorize (stmt_info);
1539 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1540 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1541 && !PURE_SLP_STMT (stmt_info))
1542 /* STMT needs both SLP and loop-based vectorization. */
1543 only_slp_in_loop = false;
1547 if (only_slp_in_loop)
1549 if (dump_enabled_p ())
1550 dump_printf_loc (MSG_NOTE, vect_location,
1551 "Loop contains only SLP stmts\n");
1552 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1554 else
1556 if (dump_enabled_p ())
1557 dump_printf_loc (MSG_NOTE, vect_location,
1558 "Loop contains SLP and non-SLP stmts\n");
1559 /* Both the vectorization factor and unroll factor have the form
1560 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1561 so they must have a common multiple. */
1562 vectorization_factor
1563 = force_common_multiple (vectorization_factor,
1564 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1567 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1568 if (dump_enabled_p ())
1570 dump_printf_loc (MSG_NOTE, vect_location,
1571 "Updating vectorization factor to ");
1572 dump_dec (MSG_NOTE, vectorization_factor);
1573 dump_printf (MSG_NOTE, ".\n");
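/* Worked example (assumed numbers): if loop-based analysis chose a
   vectorization factor of 4 but an SLP instance needs an unrolling
   factor of 6, force_common_multiple above yields 12, which satisfies
   both the non-SLP statements and the SLP instance.  */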
1577 /* Return true if STMT_INFO describes a double reduction phi and if
1578 the other phi in the reduction is also relevant for vectorization.
1579 This rejects cases such as:
1581 outer1:
1582 x_1 = PHI <x_3(outer2), ...>;
1585 inner:
1586 x_2 = ...;
1589 outer2:
1590 x_3 = PHI <x_2(inner)>;
1592 if nothing in x_2 or elsewhere makes x_1 relevant. */
1594 static bool
1595 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1597 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1598 return false;
1600 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1603 /* Function vect_analyze_loop_operations.
1605 Scan the loop stmts and make sure they are all vectorizable. */
1607 static opt_result
1608 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1610 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1611 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1612 int nbbs = loop->num_nodes;
1613 int i;
1614 stmt_vec_info stmt_info;
1615 bool need_to_vectorize = false;
1616 bool ok;
1618 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1620 auto_vec<stmt_info_for_cost> cost_vec;
1622 for (i = 0; i < nbbs; i++)
1624 basic_block bb = bbs[i];
1626 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1627 gsi_next (&si))
1629 gphi *phi = si.phi ();
1630 ok = true;
1632 stmt_info = loop_vinfo->lookup_stmt (phi);
1633 if (dump_enabled_p ())
1634 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1635 if (virtual_operand_p (gimple_phi_result (phi)))
1636 continue;
1638 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1639 (i.e., a phi in the tail of the outer-loop). */
1640 if (! is_loop_header_bb_p (bb))
1642 /* FORNOW: we currently don't support the case that these phis
1643 are not used in the outerloop (unless it is double reduction,
1644 i.e., this phi is vect_reduction_def), because this case
1645 requires us to actually do something here. */
1646 if (STMT_VINFO_LIVE_P (stmt_info)
1647 && !vect_active_double_reduction_p (stmt_info))
1648 return opt_result::failure_at (phi,
1649 "Unsupported loop-closed phi"
1650 " in outer-loop.\n");
1652 /* If PHI is used in the outer loop, we check that its operand
1653 is defined in the inner loop. */
1654 if (STMT_VINFO_RELEVANT_P (stmt_info))
1656 tree phi_op;
1658 if (gimple_phi_num_args (phi) != 1)
1659 return opt_result::failure_at (phi, "unsupported phi");
1661 phi_op = PHI_ARG_DEF (phi, 0);
1662 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1663 if (!op_def_info)
1664 return opt_result::failure_at (phi, "unsupported phi\n");
1666 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1667 && (STMT_VINFO_RELEVANT (op_def_info)
1668 != vect_used_in_outer_by_reduction))
1669 return opt_result::failure_at (phi, "unsupported phi\n");
1671 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1672 || (STMT_VINFO_DEF_TYPE (stmt_info)
1673 == vect_double_reduction_def))
1674 && !vectorizable_lc_phi (loop_vinfo,
1675 stmt_info, NULL, NULL))
1676 return opt_result::failure_at (phi, "unsupported phi\n");
1679 continue;
1682 gcc_assert (stmt_info);
1684 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1685 || STMT_VINFO_LIVE_P (stmt_info))
1686 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1687 /* A scalar-dependence cycle that we don't support. */
1688 return opt_result::failure_at (phi,
1689 "not vectorized:"
1690 " scalar dependence cycle.\n");
1692 if (STMT_VINFO_RELEVANT_P (stmt_info))
1694 need_to_vectorize = true;
1695 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1696 && ! PURE_SLP_STMT (stmt_info))
1697 ok = vectorizable_induction (loop_vinfo,
1698 stmt_info, NULL, NULL,
1699 &cost_vec);
1700 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1701 || (STMT_VINFO_DEF_TYPE (stmt_info)
1702 == vect_double_reduction_def)
1703 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1704 && ! PURE_SLP_STMT (stmt_info))
1705 ok = vectorizable_reduction (loop_vinfo,
1706 stmt_info, NULL, NULL, &cost_vec);
1709 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1710 if (ok
1711 && STMT_VINFO_LIVE_P (stmt_info)
1712 && !PURE_SLP_STMT (stmt_info))
1713 ok = vectorizable_live_operation (loop_vinfo,
1714 stmt_info, NULL, NULL, NULL,
1715 -1, false, &cost_vec);
1717 if (!ok)
1718 return opt_result::failure_at (phi,
1719 "not vectorized: relevant phi not "
1720 "supported: %G",
1721 static_cast <gimple *> (phi));
1724 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1725 gsi_next (&si))
1727 gimple *stmt = gsi_stmt (si);
1728 if (!gimple_clobber_p (stmt)
1729 && !is_gimple_debug (stmt))
1731 opt_result res
1732 = vect_analyze_stmt (loop_vinfo,
1733 loop_vinfo->lookup_stmt (stmt),
1734 &need_to_vectorize,
1735 NULL, NULL, &cost_vec);
1736 if (!res)
1737 return res;
1740 } /* bbs */
1742 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1744 /* All operations in the loop are either irrelevant (deal with loop
1745 control, or dead), or only used outside the loop and can be moved
1746 out of the loop (e.g. invariants, inductions). The loop can be
1747 optimized away by scalar optimizations. We're better off not
1748 touching this loop. */
1749 if (!need_to_vectorize)
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_NOTE, vect_location,
1753 "All the computation can be taken out of the loop.\n");
1754 return opt_result::failure_at
1755 (vect_location,
1756 "not vectorized: redundant loop. no profit to vectorize.\n");
1759 return opt_result::success ();
1762 /* Return true if we know that the iteration count is smaller than the
1763 vectorization factor. Return false if it isn't, or if we can't be sure
1764 either way. */
1766 static bool
1767 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1769 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1771 HOST_WIDE_INT max_niter;
1772 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1773 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1774 else
1775 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1777 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1778 return true;
1780 return false;
1783 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1784 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1785 definitely no, or -1 if it's worth retrying. */
1787 static int
1788 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1790 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1791 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1793 /* Only loops that can handle partially-populated vectors can have iteration
1794 counts less than the vectorization factor. */
1795 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1797 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1799 if (dump_enabled_p ())
1800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1801 "not vectorized: iteration count smaller than "
1802 "vectorization factor.\n");
1803 return 0;
1807 int min_profitable_iters, min_profitable_estimate;
1808 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1809 &min_profitable_estimate);
1811 if (min_profitable_iters < 0)
1813 if (dump_enabled_p ())
1814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1815 "not vectorized: vectorization not profitable.\n");
1816 if (dump_enabled_p ())
1817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1818 "not vectorized: vector version will never be "
1819 "profitable.\n");
1820 return -1;
1823 int min_scalar_loop_bound = (param_min_vect_loop_bound
1824 * assumed_vf);
1826 /* Use the cost model only if it is more conservative than user specified
1827 threshold. */
1828 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1829 min_profitable_iters);
1831 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
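  /* A hypothetical worked example of the threshold above: with
     param_min_vect_loop_bound == 2 and assumed_vf == 4 the user-specified
     bound contributes 2 * 4 = 8 iterations, so if min_profitable_iters
     were 6 the resulting threshold would be th = MAX (8, 6) = 8.  */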
1833 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1834 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1836 if (dump_enabled_p ())
1837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 "not vectorized: vectorization not profitable.\n");
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE, vect_location,
1841 "not vectorized: iteration count smaller than user "
1842 "specified loop bound parameter or minimum profitable "
1843 "iterations (whichever is more conservative).\n");
1844 return 0;
1847 /* The static profitability threshold min_profitable_estimate includes
1848 the cost of having to check at runtime whether the scalar loop
1849 should be used instead. If it turns out that we don't need or want
1850 such a check, the threshold we should use for the static estimate
1851 is simply the point at which the vector loop becomes more profitable
1852 than the scalar loop. */
1853 if (min_profitable_estimate > min_profitable_iters
1854 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1855 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1856 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1857 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1861 " choice between the scalar and vector loops\n");
1862 min_profitable_estimate = min_profitable_iters;
1865 HOST_WIDE_INT estimated_niter;
1867 /* If we are vectorizing an epilogue then we know the maximum number of
1868 scalar iterations it will cover is at least one lower than the
1869 vectorization factor of the main loop. */
1870 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1871 estimated_niter
1872 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1873 else
1875 estimated_niter = estimated_stmt_executions_int (loop);
1876 if (estimated_niter == -1)
1877 estimated_niter = likely_max_stmt_executions_int (loop);
1879 if (estimated_niter != -1
1880 && ((unsigned HOST_WIDE_INT) estimated_niter
1881 < MAX (th, (unsigned) min_profitable_estimate)))
1883 if (dump_enabled_p ())
1884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1885 "not vectorized: estimated iteration count too "
1886 "small.\n");
1887 if (dump_enabled_p ())
1888 dump_printf_loc (MSG_NOTE, vect_location,
1889 "not vectorized: estimated iteration count smaller "
1890 "than specified loop bound parameter or minimum "
1891 "profitable iterations (whichever is more "
1892 "conservative).\n");
1893 return -1;
1896 return 1;
1899 static opt_result
1900 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1901 vec<data_reference_p> *datarefs,
1902 unsigned int *n_stmts)
1904 *n_stmts = 0;
1905 for (unsigned i = 0; i < loop->num_nodes; i++)
1906 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1907 !gsi_end_p (gsi); gsi_next (&gsi))
1909 gimple *stmt = gsi_stmt (gsi);
1910 if (is_gimple_debug (stmt))
1911 continue;
1912 ++(*n_stmts);
1913 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1914 NULL, 0);
1915 if (!res)
1917 if (is_gimple_call (stmt) && loop->safelen)
1919 tree fndecl = gimple_call_fndecl (stmt), op;
1920 if (fndecl != NULL_TREE)
1922 cgraph_node *node = cgraph_node::get (fndecl);
1923 if (node != NULL && node->simd_clones != NULL)
1925 unsigned int j, n = gimple_call_num_args (stmt);
1926 for (j = 0; j < n; j++)
1928 op = gimple_call_arg (stmt, j);
1929 if (DECL_P (op)
1930 || (REFERENCE_CLASS_P (op)
1931 && get_base_address (op)))
1932 break;
1934 op = gimple_call_lhs (stmt);
1935 /* Ignore #pragma omp declare simd functions
1936 if they don't have data references in the
1937 call stmt itself. */
1938 if (j == n
1939 && !(op
1940 && (DECL_P (op)
1941 || (REFERENCE_CLASS_P (op)
1942 && get_base_address (op)))))
1943 continue;
1947 return res;
1949 /* If dependence analysis will give up due to the limit on the
1950 number of datarefs, stop here and fail fatally. */
1951 if (datarefs->length ()
1952 > (unsigned)param_loop_max_datarefs_for_datadeps)
1953 return opt_result::failure_at (stmt, "exceeded param "
1954 "loop-max-datarefs-for-datadeps\n");
1956 return opt_result::success ();
1959 /* Look for SLP-only access groups and turn each individual access into its own
1960 group. */
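/* An illustrative example (not taken from any particular testcase): a group
   of four interleaved loads a[4*i], a[4*i+1], a[4*i+2], a[4*i+3] that turned
   out to be usable only under SLP is split below into four independent
   single-element accesses, each recording the remaining three elements of
   the old group as its gap (unless the access is strided).  */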
1961 static void
1962 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1964 unsigned int i;
1965 struct data_reference *dr;
1967 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1969 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1970 FOR_EACH_VEC_ELT (datarefs, i, dr)
1972 gcc_assert (DR_REF (dr));
1973 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1975 /* Check if the load is a part of an interleaving chain. */
1976 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1978 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1979 unsigned int group_size = DR_GROUP_SIZE (first_element);
1981 /* Check whether this is an SLP-only group. */
1982 if (!STMT_SLP_TYPE (stmt_info)
1983 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1985 /* Dissolve the group. */
1986 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1988 stmt_vec_info vinfo = first_element;
1989 while (vinfo)
1991 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1992 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1993 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1994 DR_GROUP_SIZE (vinfo) = 1;
1995 if (STMT_VINFO_STRIDED_P (first_element))
1996 DR_GROUP_GAP (vinfo) = 0;
1997 else
1998 DR_GROUP_GAP (vinfo) = group_size - 1;
1999 vinfo = next;
2006 /* Determine if operating on full vectors for LOOP_VINFO might leave
2007 some scalar iterations still to do. If so, decide how we should
2008 handle those scalar iterations. The possibilities are:
2010 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2011 In this case:
2013 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2014 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2015 LOOP_VINFO_PEELING_FOR_NITER == false
2017 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2018 to handle the remaining scalar iterations. In this case:
2020 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2021 LOOP_VINFO_PEELING_FOR_NITER == true
2023 There are two choices:
2025 (2a) Consider vectorizing the epilogue loop at the same VF as the
2026 main loop, but using partial vectors instead of full vectors.
2027 In this case:
2029 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2031 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2032 In this case:
2034 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2036 When FOR_EPILOGUE_P is true, make this determination based on the
2037 assumption that LOOP_VINFO is an epilogue loop; otherwise make it
2038 based on the assumption that LOOP_VINFO is the main loop. The caller
2039 has made sure that the number of iterations is set appropriately for
2040 this value of FOR_EPILOGUE_P. */
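/* A hypothetical illustration of the cases above: a loop of 1003 iterations
   with a vectorization factor of 8 on a target without masking or length
   control would take option (2), running 125 full-vector iterations and
   leaving 3 scalar iterations to an epilogue loop, whereas a target that
   supports fully-masked loops could take option (1) and absorb those 3
   iterations into a final partial vector iteration.  */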
2042 opt_result
2043 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2044 bool for_epilogue_p)
2046 /* Determine whether there would be any scalar iterations left over. */
2047 bool need_peeling_or_partial_vectors_p
2048 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2050 /* Decide whether to vectorize the loop with partial vectors. */
2051 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2052 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2053 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2054 && need_peeling_or_partial_vectors_p)
2056 /* For partial-vector-usage=1, try to push the handling of partial
2057 vectors to the epilogue, with the main loop continuing to operate
2058 on full vectors.
2060 ??? We could then end up failing to use partial vectors if we
2061 decide to peel iterations into a prologue, and if the main loop
2062 then ends up processing fewer than VF iterations. */
2063 if (param_vect_partial_vector_usage == 1
2064 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2065 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2066 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2067 else
2068 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2071 if (dump_enabled_p ())
2073 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2074 dump_printf_loc (MSG_NOTE, vect_location,
2075 "operating on partial vectors%s.\n",
2076 for_epilogue_p ? " for epilogue loop" : "");
2077 else
2078 dump_printf_loc (MSG_NOTE, vect_location,
2079 "operating only on full vectors%s.\n",
2080 for_epilogue_p ? " for epilogue loop" : "");
2083 if (for_epilogue_p)
2085 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2086 gcc_assert (orig_loop_vinfo);
2087 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2088 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2089 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2092 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2093 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2095 /* Check that the loop processes at least one full vector. */
2096 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2097 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2098 if (known_lt (wi::to_widest (scalar_niters), vf))
2099 return opt_result::failure_at (vect_location,
2100 "loop does not have enough iterations"
2101 " to support vectorization.\n");
2103 /* If we need to peel an extra epilogue iteration to handle data
2104 accesses with gaps, check that there are enough scalar iterations
2105 available.
2107 The check above is redundant with this one when peeling for gaps,
2108 but the distinction is useful for diagnostics. */
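      /* As a hypothetical example: with a vectorization factor of 4 and
	 exactly 4 known iterations, peeling one iteration for gaps would
	 leave only NITERS - 1 = 3 scalar iterations, too few for a single
	 full vector, so the check below rejects the loop.  */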
2109 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2110 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2111 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2112 return opt_result::failure_at (vect_location,
2113 "loop does not have enough iterations"
2114 " to support peeling for gaps.\n");
2117 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2118 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2119 && need_peeling_or_partial_vectors_p);
2121 return opt_result::success ();
2124 /* Function vect_analyze_loop_2.
2126 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2127 for it. The different analyses will record information in the
2128 loop_vec_info struct. */
2129 static opt_result
2130 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2132 opt_result ok = opt_result::success ();
2133 int res;
2134 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2135 poly_uint64 min_vf = 2;
2136 loop_vec_info orig_loop_vinfo = NULL;
2138 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2139 loop_vec_info of the first vectorized loop. */
2140 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2141 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2142 else
2143 orig_loop_vinfo = loop_vinfo;
2144 gcc_assert (orig_loop_vinfo);
2146 /* The first group of checks is independent of the vector size. */
2147 fatal = true;
2149 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2150 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2151 return opt_result::failure_at (vect_location,
2152 "not vectorized: simd if(0)\n");
2154 /* Find all data references in the loop (which correspond to vdefs/vuses)
2155 and analyze their evolution in the loop. */
2157 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2159 /* Gather the data references and count stmts in the loop. */
2160 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2162 opt_result res
2163 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2164 &LOOP_VINFO_DATAREFS (loop_vinfo),
2165 n_stmts);
2166 if (!res)
2168 if (dump_enabled_p ())
2169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2170 "not vectorized: loop contains function "
2171 "calls or data references that cannot "
2172 "be analyzed\n");
2173 return res;
2175 loop_vinfo->shared->save_datarefs ();
2177 else
2178 loop_vinfo->shared->check_datarefs ();
2180 /* Analyze the data references and also adjust the minimal
2181 vectorization factor according to the loads and stores. */
2183 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2184 if (!ok)
2186 if (dump_enabled_p ())
2187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2188 "bad data references.\n");
2189 return ok;
2192 /* Classify all cross-iteration scalar data-flow cycles.
2193 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2194 vect_analyze_scalar_cycles (loop_vinfo);
2196 vect_pattern_recog (loop_vinfo);
2198 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2200 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2201 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2203 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2204 if (!ok)
2206 if (dump_enabled_p ())
2207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2208 "bad data access.\n");
2209 return ok;
2212 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2214 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2215 if (!ok)
2217 if (dump_enabled_p ())
2218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2219 "unexpected pattern.\n");
2220 return ok;
2223 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer treated as fatal. */
2224 fatal = false;
2226 /* Analyze data dependences between the data-refs in the loop
2227 and adjust the maximum vectorization factor according to
2228 the dependences.
2229 FORNOW: fail at the first data dependence that we encounter. */
2231 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2232 if (!ok)
2234 if (dump_enabled_p ())
2235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2236 "bad data dependence.\n");
2237 return ok;
2239 if (max_vf != MAX_VECTORIZATION_FACTOR
2240 && maybe_lt (max_vf, min_vf))
2241 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2242 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2244 ok = vect_determine_vectorization_factor (loop_vinfo);
2245 if (!ok)
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "can't determine vectorization factor.\n");
2250 return ok;
2252 if (max_vf != MAX_VECTORIZATION_FACTOR
2253 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2254 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2256 /* Compute the scalar iteration cost. */
2257 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2259 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2261 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2262 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2263 if (!ok)
2264 return ok;
2266 /* If there are any SLP instances mark them as pure_slp. */
2267 bool slp = vect_make_slp_decision (loop_vinfo);
2268 if (slp)
2270 /* Find stmts that need to be both vectorized and SLPed. */
2271 vect_detect_hybrid_slp (loop_vinfo);
2273 /* Update the vectorization factor based on the SLP decision. */
2274 vect_update_vf_for_slp (loop_vinfo);
2276 /* Optimize the SLP graph with the vectorization factor fixed. */
2277 vect_optimize_slp (loop_vinfo);
2280 bool saved_can_use_partial_vectors_p
2281 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2283 /* We don't expect to have to roll back to anything other than an empty
2284 set of rgroups. */
2285 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2287 /* This is the point where we can re-start analysis with SLP forced off. */
2288 start_over:
2290 /* Now the vectorization factor is final. */
2291 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2292 gcc_assert (known_ne (vectorization_factor, 0U));
2294 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "vectorization_factor = ");
2298 dump_dec (MSG_NOTE, vectorization_factor);
2299 dump_printf (MSG_NOTE, ", niters = %wd\n",
2300 LOOP_VINFO_INT_NITERS (loop_vinfo));
2303 /* Analyze the alignment of the data-refs in the loop.
2304 Fail if a data reference is found that cannot be vectorized. */
2306 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2307 if (!ok)
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "bad data alignment.\n");
2312 return ok;
2315 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2316 It is important to call pruning after vect_analyze_data_ref_accesses,
2317 since we use grouping information gathered by interleaving analysis. */
2318 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2319 if (!ok)
2320 return ok;
2322 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2323 vectorization, since we do not want to add extra peeling or
2324 add versioning for alignment. */
2325 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2326 /* This pass will decide on using loop versioning and/or loop peeling in
2327 order to enhance the alignment of data references in the loop. */
2328 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2329 if (!ok)
2330 return ok;
2332 if (slp)
2334 /* Analyze operations in the SLP instances. Note this may
2335 remove unsupported SLP instances which makes the above
2336 SLP kind detection invalid. */
2337 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2338 vect_slp_analyze_operations (loop_vinfo);
2339 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2341 ok = opt_result::failure_at (vect_location,
2342 "unsupported SLP instances\n");
2343 goto again;
2347 /* Dissolve SLP-only groups. */
2348 vect_dissolve_slp_only_groups (loop_vinfo);
2350 /* Scan all the remaining operations in the loop that are not subject
2351 to SLP and make sure they are vectorizable. */
2352 ok = vect_analyze_loop_operations (loop_vinfo);
2353 if (!ok)
2355 if (dump_enabled_p ())
2356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2357 "bad operation or unsupported loop bound.\n");
2358 return ok;
2361 /* For now, we don't expect to mix both masking and length approaches for one
2362 loop, so disable the use of partial vectors if both are recorded. */
2363 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2364 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2365 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369 "can't vectorize a loop with partial vectors"
2370 " because we don't expect to mix different"
2371 " approaches with partial vectors for the"
2372 " same loop.\n");
2373 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2376 /* If we still have the option of using partial vectors,
2377 check whether we can generate the necessary loop controls. */
2378 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2379 && !vect_verify_full_masking (loop_vinfo)
2380 && !vect_verify_loop_lens (loop_vinfo))
2381 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2383 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2384 to be able to handle fewer than VF scalars, or needs to have a lower VF
2385 than the main loop. */
2386 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2387 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2388 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2389 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2390 return opt_result::failure_at (vect_location,
2391 "Vectorization factor too high for"
2392 " epilogue loop.\n");
2394 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2395 assuming that the loop will be used as a main loop. We will redo
2396 this analysis later if we instead decide to use the loop as an
2397 epilogue loop. */
2398 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2399 if (!ok)
2400 return ok;
2402 /* Check the costings of the loop make vectorizing worthwhile. */
2403 res = vect_analyze_loop_costing (loop_vinfo);
2404 if (res < 0)
2406 ok = opt_result::failure_at (vect_location,
2407 "Loop costings may not be worthwhile.\n");
2408 goto again;
2410 if (!res)
2411 return opt_result::failure_at (vect_location,
2412 "Loop costings not worthwhile.\n");
2414 /* If an epilogue loop is required make sure we can create one. */
2415 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2416 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2420 if (!vect_can_advance_ivs_p (loop_vinfo)
2421 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2422 single_exit (LOOP_VINFO_LOOP
2423 (loop_vinfo))))
2425 ok = opt_result::failure_at (vect_location,
2426 "not vectorized: can't create required "
2427 "epilog loop\n");
2428 goto again;
2432 /* During peeling, we need to check whether the number of loop iterations
2433 is enough for both the peeled prolog loop and the vector loop. This check
2434 can be merged along with threshold check of loop versioning, so
2435 increase threshold for this case if necessary.
2437 If we are analyzing an epilogue we still want to check what its
2438 versioning threshold would be. If we decide to vectorize the epilogues we
2439 will want to use the lowest versioning threshold of all epilogues and main
2440 loop. This will enable us to enter a vectorized epilogue even when
2441 versioning the loop. We can't simply check whether the epilogue requires
2442 versioning though since we may have skipped some versioning checks when
2443 analyzing the epilogue. For instance, checks for alias versioning will be
2444 skipped when dealing with epilogues as we assume we already checked them
2445 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2446 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2448 poly_uint64 niters_th = 0;
2449 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2451 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2453 /* Niters for peeled prolog loop. */
2454 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2456 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2457 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2458 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2460 else
2461 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2464 /* Niters for at least one iteration of vectorized loop. */
2465 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2466 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2467 /* One additional iteration because of peeling for gap. */
2468 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2469 niters_th += 1;
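      /* Hypothetical running total: with 2 prologue iterations peeled for
	 alignment, a vectorization factor of 4, no partial vectors and
	 peeling for gaps, niters_th at this point is 2 + 4 + 1 = 7.  */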
2471 /* Use the same condition as vect_transform_loop to decide when to use
2472 the cost to determine a versioning threshold. */
2473 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2474 && ordered_p (th, niters_th))
2475 niters_th = ordered_max (poly_uint64 (th), niters_th);
2477 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2480 gcc_assert (known_eq (vectorization_factor,
2481 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2483 /* Ok to vectorize! */
2484 return opt_result::success ();
2486 again:
2487 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2488 gcc_assert (!ok);
2490 /* Try again with SLP forced off but if we didn't do any SLP there is
2491 no point in re-trying. */
2492 if (!slp)
2493 return ok;
2495 /* If there are reduction chains re-trying will fail anyway. */
2496 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2497 return ok;
2499 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2500 via interleaving or lane instructions. */
2501 slp_instance instance;
2502 slp_tree node;
2503 unsigned i, j;
2504 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2506 stmt_vec_info vinfo;
2507 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2508 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2509 continue;
2510 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2511 unsigned int size = DR_GROUP_SIZE (vinfo);
2512 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2513 if (! vect_store_lanes_supported (vectype, size, false)
2514 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2515 && ! vect_grouped_store_supported (vectype, size))
2516 return opt_result::failure_at (vinfo->stmt,
2517 "unsupported grouped store\n");
2518 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2520 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2521 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2522 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2523 size = DR_GROUP_SIZE (vinfo);
2524 vectype = STMT_VINFO_VECTYPE (vinfo);
2525 if (! vect_load_lanes_supported (vectype, size, false)
2526 && ! vect_grouped_load_supported (vectype, single_element_p,
2527 size))
2528 return opt_result::failure_at (vinfo->stmt,
2529 "unsupported grouped load\n");
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_NOTE, vect_location,
2535 "re-trying with SLP disabled\n");
2537 /* Roll back state appropriately. No SLP this time. */
2538 slp = false;
2539 /* Restore the vectorization factor as it would be without SLP. */
2540 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2541 /* Free the SLP instances. */
2542 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2543 vect_free_slp_instance (instance);
2544 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2545 /* Reset SLP type to loop_vect on all stmts. */
2546 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2548 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2549 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2550 !gsi_end_p (si); gsi_next (&si))
2552 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2553 STMT_SLP_TYPE (stmt_info) = loop_vect;
2554 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2555 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2557 /* vectorizable_reduction adjusts reduction stmt def-types,
2558 restore them to that of the PHI. */
2559 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2560 = STMT_VINFO_DEF_TYPE (stmt_info);
2561 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2562 (STMT_VINFO_REDUC_DEF (stmt_info)))
2563 = STMT_VINFO_DEF_TYPE (stmt_info);
2566 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2567 !gsi_end_p (si); gsi_next (&si))
2569 if (is_gimple_debug (gsi_stmt (si)))
2570 continue;
2571 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2572 STMT_SLP_TYPE (stmt_info) = loop_vect;
2573 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2575 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2576 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2577 STMT_SLP_TYPE (stmt_info) = loop_vect;
2578 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2579 !gsi_end_p (pi); gsi_next (&pi))
2580 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2581 = loop_vect;
2585 /* Free optimized alias test DDRS. */
2586 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2587 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2588 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2589 /* Reset target cost data. */
2590 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2591 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2592 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2593 /* Reset accumulated rgroup information. */
2594 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2595 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2596 /* Reset assorted flags. */
2597 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2598 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2599 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2600 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2601 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2602 = saved_can_use_partial_vectors_p;
2604 goto start_over;
2607 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2608 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2609 OLD_LOOP_VINFO is better unless something specifically indicates
2610 otherwise.
2612 Note that this deliberately isn't a partial order. */
2614 static bool
2615 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2616 loop_vec_info old_loop_vinfo)
2618 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2619 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2621 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2622 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2624 /* Always prefer a VF of loop->simdlen over any other VF. */
2625 if (loop->simdlen)
2627 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2628 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2629 if (new_simdlen_p != old_simdlen_p)
2630 return new_simdlen_p;
2633 /* Limit the VFs to what is likely to be the maximum number of iterations,
2634 to handle cases in which at least one loop_vinfo is fully-masked. */
2635 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2636 if (estimated_max_niter != -1)
2638 if (known_le (estimated_max_niter, new_vf))
2639 new_vf = estimated_max_niter;
2640 if (known_le (estimated_max_niter, old_vf))
2641 old_vf = estimated_max_niter;
2644 /* Check whether the (fractional) cost per scalar iteration is lower
2645 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2646 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2647 * poly_widest_int (old_vf));
2648 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2649 * poly_widest_int (new_vf));
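  /* A hypothetical comparison: a new loop body with vec_inside_cost 20 at
     VF 8 (2.5 per scalar iteration) beats an old body with vec_inside_cost
     12 at VF 4 (3 per scalar iteration), since rel_new = 20 * 4 = 80 is
     less than rel_old = 12 * 8 = 96.  */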
2650 if (maybe_lt (rel_old, rel_new))
2652 /* When old_loop_vinfo uses a variable vectorization factor,
2653 we know that it has a lower cost for at least one runtime VF.
2654 However, we don't know how likely that VF is.
2656 One option would be to compare the costs for the estimated VFs.
2657 The problem is that that can put too much pressure on the cost
2658 model. E.g. if the estimated VF is also the lowest possible VF,
2659 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2660 for the estimated VF, we'd then choose new_loop_vinfo even
2661 though (a) new_loop_vinfo might not actually be better than
2662 old_loop_vinfo for that VF and (b) it would be significantly
2663 worse at larger VFs.
2665 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2666 no more expensive than old_loop_vinfo even after doubling the
2667 estimated old_loop_vinfo VF. For all but trivial loops, this
2668 ensures that we only pick new_loop_vinfo if it is significantly
2669 better than old_loop_vinfo at the estimated VF. */
2670 if (rel_new.is_constant ())
2671 return false;
2673 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2674 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2675 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2676 * widest_int (old_estimated_vf));
2677 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2678 * widest_int (new_estimated_vf));
2679 return estimated_rel_new * 2 <= estimated_rel_old;
2681 if (known_lt (rel_new, rel_old))
2682 return true;
2684 /* If there's nothing to choose between the loop bodies, see whether
2685 there's a difference in the prologue and epilogue costs. */
2686 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2687 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2689 return false;
2692 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2693 true if we should. */
2695 static bool
2696 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2697 loop_vec_info old_loop_vinfo)
2699 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2700 return false;
2702 if (dump_enabled_p ())
2703 dump_printf_loc (MSG_NOTE, vect_location,
2704 "***** Preferring vector mode %s to vector mode %s\n",
2705 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2706 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2707 return true;
2710 /* Function vect_analyze_loop.
2712 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2713 for it. The different analyses will record information in the
2714 loop_vec_info struct. */
2715 opt_loop_vec_info
2716 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2718 auto_vector_modes vector_modes;
2720 /* Autodetect the first vector size we try. */
2721 unsigned int autovec_flags
2722 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2723 loop->simdlen != 0);
2724 unsigned int mode_i = 0;
2726 DUMP_VECT_SCOPE ("analyze_loop_nest");
2728 if (loop_outer (loop)
2729 && loop_vec_info_for_loop (loop_outer (loop))
2730 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2731 return opt_loop_vec_info::failure_at (vect_location,
2732 "outer-loop already vectorized.\n");
2734 if (!find_loop_nest (loop, &shared->loop_nest))
2735 return opt_loop_vec_info::failure_at
2736 (vect_location,
2737 "not vectorized: loop nest containing two or more consecutive inner"
2738 " loops cannot be vectorized\n");
2740 unsigned n_stmts = 0;
2741 machine_mode autodetected_vector_mode = VOIDmode;
2742 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2743 machine_mode next_vector_mode = VOIDmode;
2744 poly_uint64 lowest_th = 0;
2745 unsigned vectorized_loops = 0;
2746 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2747 && !unlimited_cost_model (loop));
2749 bool vect_epilogues = false;
2750 opt_result res = opt_result::success ();
2751 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2752 while (1)
2754 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2755 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2756 if (!loop_vinfo)
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 "bad loop form.\n");
2761 gcc_checking_assert (first_loop_vinfo == NULL);
2762 return loop_vinfo;
2764 loop_vinfo->vector_mode = next_vector_mode;
2766 bool fatal = false;
2768 /* When pick_lowest_cost_p is true, we should in principle iterate
2769 over all the loop_vec_infos that LOOP_VINFO could replace and
2770 try to vectorize LOOP_VINFO under the same conditions.
2771 E.g. when trying to replace an epilogue loop, we should vectorize
2772 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2773 to replace the main loop, we should vectorize LOOP_VINFO as a main
2774 loop too.
2776 However, autovectorize_vector_modes is usually sorted as follows:
2778 - Modes that naturally produce lower VFs usually follow modes that
2779 naturally produce higher VFs.
2781 - When modes naturally produce the same VF, maskable modes
2782 usually follow unmaskable ones, so that the maskable mode
2783 can be used to vectorize the epilogue of the unmaskable mode.
2785 This order is preferred because it leads to the maximum
2786 epilogue vectorization opportunities. Targets should only use
2787 a different order if they want to make wide modes available while
2788 disparaging them relative to earlier, smaller modes. The assumption
2789 in that case is that the wider modes are more expensive in some
2790 way that isn't reflected directly in the costs.
2792 There should therefore be few interesting cases in which
2793 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2794 treated as a standalone loop, and ends up being genuinely cheaper
2795 than FIRST_LOOP_VINFO. */
2796 if (vect_epilogues)
2797 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2799 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2800 if (mode_i == 0)
2801 autodetected_vector_mode = loop_vinfo->vector_mode;
2802 if (dump_enabled_p ())
2804 if (res)
2805 dump_printf_loc (MSG_NOTE, vect_location,
2806 "***** Analysis succeeded with vector mode %s\n",
2807 GET_MODE_NAME (loop_vinfo->vector_mode));
2808 else
2809 dump_printf_loc (MSG_NOTE, vect_location,
2810 "***** Analysis failed with vector mode %s\n",
2811 GET_MODE_NAME (loop_vinfo->vector_mode));
2814 loop->aux = NULL;
2816 if (!fatal)
2817 while (mode_i < vector_modes.length ()
2818 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2820 if (dump_enabled_p ())
2821 dump_printf_loc (MSG_NOTE, vect_location,
2822 "***** The result for vector mode %s would"
2823 " be the same\n",
2824 GET_MODE_NAME (vector_modes[mode_i]));
2825 mode_i += 1;
2828 if (res)
2830 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2831 vectorized_loops++;
2833 /* Once we hit the desired simdlen for the first time,
2834 discard any previous attempts. */
2835 if (simdlen
2836 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2838 delete first_loop_vinfo;
2839 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2840 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2841 simdlen = 0;
2843 else if (pick_lowest_cost_p && first_loop_vinfo)
2845 /* Keep trying to roll back vectorization attempts while the
2846 loop_vec_infos they produced were worse than this one. */
2847 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2848 while (!vinfos.is_empty ()
2849 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2851 gcc_assert (vect_epilogues);
2852 delete vinfos.pop ();
2854 if (vinfos.is_empty ()
2855 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2857 delete first_loop_vinfo;
2858 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2859 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2863 if (first_loop_vinfo == NULL)
2865 first_loop_vinfo = loop_vinfo;
2866 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2868 else if (vect_epilogues
2869 /* For now only allow one epilogue loop. */
2870 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2872 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2873 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2874 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2875 || maybe_ne (lowest_th, 0U));
2876 /* Keep track of the known smallest versioning
2877 threshold. */
2878 if (ordered_p (lowest_th, th))
2879 lowest_th = ordered_min (lowest_th, th);
2881 else
2883 delete loop_vinfo;
2884 loop_vinfo = opt_loop_vec_info::success (NULL);
2887 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2888 enabled, SIMDUID is not set, it is the innermost loop and we have
2889 either already found the loop's SIMDLEN or there was no SIMDLEN to
2890 begin with.
2891 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2892 vect_epilogues = (!simdlen
2893 && loop->inner == NULL
2894 && param_vect_epilogues_nomask
2895 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2896 && !loop->simduid
2897 /* For now only allow one epilogue loop, but allow
2898 pick_lowest_cost_p to replace it. */
2899 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2900 || pick_lowest_cost_p));
2902 /* Commit to first_loop_vinfo if we have no reason to try
2903 alternatives. */
2904 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2905 break;
2907 else
2909 delete loop_vinfo;
2910 loop_vinfo = opt_loop_vec_info::success (NULL);
2911 if (fatal)
2913 gcc_checking_assert (first_loop_vinfo == NULL);
2914 break;
2918 /* Handle the case where the original loop can use partial
2919 vectorization, but we only want to adopt it for the epilogue.
2920 The retry should use the same vector mode as the original loop. */
2921 if (vect_epilogues
2922 && loop_vinfo
2923 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
2925 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2926 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
2927 if (dump_enabled_p ())
2928 dump_printf_loc (MSG_NOTE, vect_location,
2929 "***** Re-trying analysis with same vector mode"
2930 " %s for epilogue with partial vectors.\n",
2931 GET_MODE_NAME (loop_vinfo->vector_mode));
2932 continue;
2935 if (mode_i < vector_modes.length ()
2936 && VECTOR_MODE_P (autodetected_vector_mode)
2937 && (related_vector_mode (vector_modes[mode_i],
2938 GET_MODE_INNER (autodetected_vector_mode))
2939 == autodetected_vector_mode)
2940 && (related_vector_mode (autodetected_vector_mode,
2941 GET_MODE_INNER (vector_modes[mode_i]))
2942 == vector_modes[mode_i]))
2944 if (dump_enabled_p ())
2945 dump_printf_loc (MSG_NOTE, vect_location,
2946 "***** Skipping vector mode %s, which would"
2947 " repeat the analysis for %s\n",
2948 GET_MODE_NAME (vector_modes[mode_i]),
2949 GET_MODE_NAME (autodetected_vector_mode));
2950 mode_i += 1;
2953 if (mode_i == vector_modes.length ()
2954 || autodetected_vector_mode == VOIDmode)
2955 break;
2957 /* Try the next biggest vector size. */
2958 next_vector_mode = vector_modes[mode_i++];
2959 if (dump_enabled_p ())
2960 dump_printf_loc (MSG_NOTE, vect_location,
2961 "***** Re-trying analysis with vector mode %s\n",
2962 GET_MODE_NAME (next_vector_mode));
2965 if (first_loop_vinfo)
2967 loop->aux = (loop_vec_info) first_loop_vinfo;
2968 if (dump_enabled_p ())
2969 dump_printf_loc (MSG_NOTE, vect_location,
2970 "***** Choosing vector mode %s\n",
2971 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2972 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2973 return first_loop_vinfo;
2976 return opt_loop_vec_info::propagate_failure (res);
2979 /* Return true if there is an in-order reduction function for CODE, storing
2980 it in *REDUC_FN if so. */
2982 static bool
2983 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2985 switch (code)
2987 case PLUS_EXPR:
2988 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2989 return true;
2991 default:
2992 return false;
2996 /* Function reduction_fn_for_scalar_code
2998 Input:
2999 CODE - tree_code of a reduction operation.
3001 Output:
3002 REDUC_FN - the corresponding internal function to be used to reduce the
3003 vector of partial results into a single scalar result, or IFN_LAST
3004 if the operation is a supported reduction operation, but does not have
3005 such an internal function.
3007 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3009 static bool
3010 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3012 switch (code)
3014 case MAX_EXPR:
3015 *reduc_fn = IFN_REDUC_MAX;
3016 return true;
3018 case MIN_EXPR:
3019 *reduc_fn = IFN_REDUC_MIN;
3020 return true;
3022 case PLUS_EXPR:
3023 *reduc_fn = IFN_REDUC_PLUS;
3024 return true;
3026 case BIT_AND_EXPR:
3027 *reduc_fn = IFN_REDUC_AND;
3028 return true;
3030 case BIT_IOR_EXPR:
3031 *reduc_fn = IFN_REDUC_IOR;
3032 return true;
3034 case BIT_XOR_EXPR:
3035 *reduc_fn = IFN_REDUC_XOR;
3036 return true;
3038 case MULT_EXPR:
3039 case MINUS_EXPR:
3040 *reduc_fn = IFN_LAST;
3041 return true;
3043 default:
3044 return false;
3048 /* If there is a neutral value X such that SLP reduction NODE would not
3049 be affected by the introduction of additional X elements, return that X,
3050 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3051 is the vector type that would hold element X. REDUC_CHAIN is true if
3052 the SLP statements perform a single reduction, false if each statement
3053 performs an independent reduction. */
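/* For example, 0 is neutral for a PLUS_EXPR or BIT_IOR_EXPR reduction,
   1 for a MULT_EXPR reduction and an all-ones value for a BIT_AND_EXPR
   reduction: padding the vectors with extra neutral elements leaves the
   final reduced value unchanged.  */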
3055 static tree
3056 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3057 tree_code code, bool reduc_chain)
3059 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3060 stmt_vec_info stmt_vinfo = stmts[0];
3061 tree scalar_type = TREE_TYPE (vector_type);
3062 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3063 gcc_assert (loop);
3065 switch (code)
3067 case WIDEN_SUM_EXPR:
3068 case DOT_PROD_EXPR:
3069 case SAD_EXPR:
3070 case PLUS_EXPR:
3071 case MINUS_EXPR:
3072 case BIT_IOR_EXPR:
3073 case BIT_XOR_EXPR:
3074 return build_zero_cst (scalar_type);
3076 case MULT_EXPR:
3077 return build_one_cst (scalar_type);
3079 case BIT_AND_EXPR:
3080 return build_all_ones_cst (scalar_type);
3082 case MAX_EXPR:
3083 case MIN_EXPR:
3084 /* For MIN/MAX the initial values are neutral. A reduction chain
3085 has only a single initial value, so that value is neutral for
3086 all statements. */
3087 if (reduc_chain)
3088 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3089 loop_preheader_edge (loop));
3090 return NULL_TREE;
3092 default:
3093 return NULL_TREE;
3097 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3098 STMT is printed with a message MSG. */
3100 static void
3101 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3103 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3106 /* Return true if we need an in-order reduction for operation CODE
3107 on type TYPE, i.e. whether the reduction has to be computed strictly
3108 in the original (left-to-right) order. */
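/* For instance, a float PLUS_EXPR reduction compiled without
   -fassociative-math must be computed as an in-order (fold-left) reduction,
   whereas float MIN_EXPR/MAX_EXPR reductions never need to be.  */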
3110 bool
3111 needs_fold_left_reduction_p (tree type, tree_code code)
3113 /* CHECKME: check for !flag_finite_math_only too? */
3114 if (SCALAR_FLOAT_TYPE_P (type))
3115 switch (code)
3117 case MIN_EXPR:
3118 case MAX_EXPR:
3119 return false;
3121 default:
3122 return !flag_associative_math;
3125 if (INTEGRAL_TYPE_P (type))
3127 if (!operation_no_trapping_overflow (type, code))
3128 return true;
3129 return false;
3132 if (SAT_FIXED_POINT_TYPE_P (type))
3133 return true;
3135 return false;
3138 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3139 has a handled computation expression. Store the main reduction
3140 operation in *CODE. */
3142 static bool
3143 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3144 tree loop_arg, enum tree_code *code,
3145 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3147 auto_bitmap visited;
3148 tree lookfor = PHI_RESULT (phi);
3149 ssa_op_iter curri;
3150 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3151 while (USE_FROM_PTR (curr) != loop_arg)
3152 curr = op_iter_next_use (&curri);
3153 curri.i = curri.numops;
3156 path.safe_push (std::make_pair (curri, curr));
3157 tree use = USE_FROM_PTR (curr);
3158 if (use == lookfor)
3159 break;
3160 gimple *def = SSA_NAME_DEF_STMT (use);
3161 if (gimple_nop_p (def)
3162 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3164 pop:
3167 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3168 curri = x.first;
3169 curr = x.second;
3171 curr = op_iter_next_use (&curri);
3172 /* Skip already visited or non-SSA operands (from iterating
3173 over PHI args). */
3174 while (curr != NULL_USE_OPERAND_P
3175 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3176 || ! bitmap_set_bit (visited,
3177 SSA_NAME_VERSION
3178 (USE_FROM_PTR (curr)))));
3180 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3181 if (curr == NULL_USE_OPERAND_P)
3182 break;
3184 else
3186 if (gimple_code (def) == GIMPLE_PHI)
3187 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3188 else
3189 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3190 while (curr != NULL_USE_OPERAND_P
3191 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3192 || ! bitmap_set_bit (visited,
3193 SSA_NAME_VERSION
3194 (USE_FROM_PTR (curr)))))
3195 curr = op_iter_next_use (&curri);
3196 if (curr == NULL_USE_OPERAND_P)
3197 goto pop;
3200 while (1);
3201 if (dump_file && (dump_flags & TDF_DETAILS))
3203 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3204 unsigned i;
3205 std::pair<ssa_op_iter, use_operand_p> *x;
3206 FOR_EACH_VEC_ELT (path, i, x)
3207 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3208 dump_printf (MSG_NOTE, "\n");
3211 /* Check whether the reduction path detected is valid. */
3212 bool fail = path.length () == 0;
3213 bool neg = false;
3214 int sign = -1;
3215 *code = ERROR_MARK;
3216 for (unsigned i = 1; i < path.length (); ++i)
3218 gimple *use_stmt = USE_STMT (path[i].second);
3219 tree op = USE_FROM_PTR (path[i].second);
3220 if (! is_gimple_assign (use_stmt)
3221 /* The following makes sure we can compute the operand index
3222 easily, plus it mostly disallows chaining via COND_EXPR condition
3223 operands. */
3224 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3225 && (gimple_num_ops (use_stmt) <= 2
3226 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3227 && (gimple_num_ops (use_stmt) <= 3
3228 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3230 fail = true;
3231 break;
3233 /* Check that the op is used in only a single stmt inside
3234 the loop. */
3235 imm_use_iterator imm_iter;
3236 gimple *op_use_stmt;
3237 unsigned cnt = 0;
3238 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3239 if (!is_gimple_debug (op_use_stmt)
3240 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
3242 /* We want to allow x + x but not x < 1 ? x : 2. */
3243 if (is_gimple_assign (op_use_stmt)
3244 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3246 use_operand_p use_p;
3247 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3248 cnt++;
3250 else
3251 cnt++;
3253 if (cnt != 1)
3255 fail = true;
3256 break;
3258 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3259 if (use_code == MINUS_EXPR)
3261 use_code = PLUS_EXPR;
3262 /* Track whether we negate the reduction value each iteration. */
3263 if (gimple_assign_rhs2 (use_stmt) == op)
3264 neg = ! neg;
3266 if (CONVERT_EXPR_CODE_P (use_code)
3267 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3268 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3270 else if (*code == ERROR_MARK)
3272 *code = use_code;
3273 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3275 else if (use_code != *code)
3277 fail = true;
3278 break;
3280 else if ((use_code == MIN_EXPR
3281 || use_code == MAX_EXPR)
3282 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3284 fail = true;
3285 break;
3288 return ! fail && ! neg && *code != ERROR_MARK;
3291 bool
3292 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3293 tree loop_arg, enum tree_code code)
3295 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3296 enum tree_code code_;
3297 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3298 && code_ == code);
3303 /* Function vect_is_simple_reduction
3305 (1) Detect a cross-iteration def-use cycle that represents a simple
3306 reduction computation. We look for the following pattern:
3308 loop_header:
3309 a1 = phi < a0, a2 >
3310 a3 = ...
3311 a2 = operation (a3, a1)
3313 or
3315 a3 = ...
3316 loop_header:
3317 a1 = phi < a0, a2 >
3318 a2 = operation (a3, a1)
3320 such that:
3321 1. operation is commutative and associative and it is safe to
3322 change the order of the computation
3323 2. no uses for a2 in the loop (a2 is used out of the loop)
3324 3. no uses of a1 in the loop besides the reduction operation
3325 4. no uses of a1 outside the loop.
3327 Conditions 1,4 are tested here.
3328 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3330 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3331 nested cycles.
3333 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3334 reductions:
3336 a1 = phi < a0, a2 >
3337 inner loop (def of a3)
3338 a2 = phi < a3 >
3340 (4) Detect condition expressions, i.e.:
3341 for (int i = 0; i < N; i++)
3342 if (a[i] < val)
3343 ret_val = a[i];
3347 static stmt_vec_info
3348 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3349 bool *double_reduc, bool *reduc_chain_p)
3351 gphi *phi = as_a <gphi *> (phi_info->stmt);
3352 gimple *phi_use_stmt = NULL;
3353 imm_use_iterator imm_iter;
3354 use_operand_p use_p;
3356 *double_reduc = false;
3357 *reduc_chain_p = false;
3358 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3360 tree phi_name = PHI_RESULT (phi);
3361 /* ??? If there are no uses of the PHI result the inner loop reduction
3362 won't be detected as possibly double-reduction by vectorizable_reduction
3363 because that tries to walk the PHI arg from the preheader edge which
3364 can be constant. See PR60382. */
3365 if (has_zero_uses (phi_name))
3366 return NULL;
3367 class loop *loop = (gimple_bb (phi))->loop_father;
3368 unsigned nphi_def_loop_uses = 0;
3369 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3371 gimple *use_stmt = USE_STMT (use_p);
3372 if (is_gimple_debug (use_stmt))
3373 continue;
3375 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3377 if (dump_enabled_p ())
3378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3379 "intermediate value used outside loop.\n");
3381 return NULL;
3384 nphi_def_loop_uses++;
3385 phi_use_stmt = use_stmt;
3388 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3389 if (TREE_CODE (latch_def) != SSA_NAME)
3391 if (dump_enabled_p ())
3392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3393 "reduction: not ssa_name: %T\n", latch_def);
3394 return NULL;
3397 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3398 if (!def_stmt_info
3399 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3400 return NULL;
3402 bool nested_in_vect_loop
3403 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3404 unsigned nlatch_def_loop_uses = 0;
3405 auto_vec<gphi *, 3> lcphis;
3406 bool inner_loop_of_double_reduc = false;
3407 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3409 gimple *use_stmt = USE_STMT (use_p);
3410 if (is_gimple_debug (use_stmt))
3411 continue;
3412 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3413 nlatch_def_loop_uses++;
3414 else
3416 /* We can have more than one loop-closed PHI. */
3417 lcphis.safe_push (as_a <gphi *> (use_stmt));
3418 if (nested_in_vect_loop
3419 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3420 == vect_double_reduction_def))
3421 inner_loop_of_double_reduc = true;
3425 /* If we are vectorizing an inner reduction, we execute it
3426 in the original order only if we are not dealing with a
3427 double reduction. */
3428 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3430 if (dump_enabled_p ())
3431 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3432 "detected nested cycle: ");
3433 return def_stmt_info;
3436 /* If this isn't a nested cycle or if the nested cycle reduction value
3437 is used outside of the inner loop, we cannot handle uses of the reduction
3438 value. */
3439 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3441 if (dump_enabled_p ())
3442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3443 "reduction used in loop.\n");
3444 return NULL;
3447 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3448 defined in the inner loop. */
3449 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3451 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3452 if (gimple_phi_num_args (def_stmt) != 1
3453 || TREE_CODE (op1) != SSA_NAME)
3455 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3457 "unsupported phi node definition.\n");
3459 return NULL;
3462 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3463 if (gimple_bb (def1)
3464 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3465 && loop->inner
3466 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3467 && is_gimple_assign (def1)
3468 && is_a <gphi *> (phi_use_stmt)
3469 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3471 if (dump_enabled_p ())
3472 report_vect_op (MSG_NOTE, def_stmt,
3473 "detected double reduction: ");
3475 *double_reduc = true;
3476 return def_stmt_info;
3479 return NULL;
 3482   /* Look for the expression computing latch_def from the loop PHI result.  */
3483 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3484 enum tree_code code;
3485 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3486 path))
3488 STMT_VINFO_REDUC_CODE (phi_info) = code;
3489 if (code == COND_EXPR && !nested_in_vect_loop)
3490 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3492 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3493 reduction chain for which the additional restriction is that
3494 all operations in the chain are the same. */
3495 auto_vec<stmt_vec_info, 8> reduc_chain;
3496 unsigned i;
3497 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3498 for (i = path.length () - 1; i >= 1; --i)
3500 gimple *stmt = USE_STMT (path[i].second);
3501 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3502 STMT_VINFO_REDUC_IDX (stmt_info)
3503 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3504 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3505 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3506 && (i == 1 || i == path.length () - 1));
3507 if ((stmt_code != code && !leading_conversion)
3508 /* We can only handle the final value in epilogue
3509 generation for reduction chains. */
3510 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3511 is_slp_reduc = false;
 3512 	  /* For reduction chains we support trailing/leading
 3513 	     conversions.  We do not store those in the actual chain.  */
3514 if (leading_conversion)
3515 continue;
3516 reduc_chain.safe_push (stmt_info);
3518 if (is_slp_reduc && reduc_chain.length () > 1)
3520 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3522 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3523 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3525 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3526 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3528 /* Save the chain for further analysis in SLP detection. */
3529 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3530 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3532 *reduc_chain_p = true;
3533 if (dump_enabled_p ())
3534 dump_printf_loc (MSG_NOTE, vect_location,
3535 "reduction: detected reduction chain\n");
3537 else if (dump_enabled_p ())
3538 dump_printf_loc (MSG_NOTE, vect_location,
3539 "reduction: detected reduction\n");
3541 return def_stmt_info;
3544 if (dump_enabled_p ())
3545 dump_printf_loc (MSG_NOTE, vect_location,
3546 "reduction: unknown pattern\n");
3548 return NULL;
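/* Illustrative example of the chain detection above (hypothetical source):
   for
     s = s + a[i];
   a single PLUS_EXPR statement closes the cycle and a plain reduction is
   recorded, whereas for
     s = s + a[i];
     s = s + b[i];
   both statements use PLUS_EXPR and each intermediate result has a single
   use, so they are linked through REDUC_GROUP_FIRST_ELEMENT and
   REDUC_GROUP_NEXT_ELEMENT and pushed as a reduction chain for later SLP
   analysis.  */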
3551 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3552 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3553 or -1 if not known. */
3555 static int
3556 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3558 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3559 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3561 if (dump_enabled_p ())
3562 dump_printf_loc (MSG_NOTE, vect_location,
3563 "cost model: epilogue peel iters set to vf/2 "
 3564 		     "because loop iterations are unknown.\n");
3565 return assumed_vf / 2;
3567 else
3569 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3570 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3571 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3572 /* If we need to peel for gaps, but no peeling is required, we have to
3573 peel VF iterations. */
3574 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3575 peel_iters_epilogue = assumed_vf;
3576 return peel_iters_epilogue;
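/* For illustration (hypothetical numbers): with LOOP_VINFO_INT_NITERS = 100,
   an assumed vectorization factor of 8 and PEEL_ITERS_PROLOGUE = 3, the
   code above computes (100 - 3) % 8 = 1 epilogue iteration; if peeling for
   gaps were required and that remainder were 0, the result would instead be
   the full vectorization factor, 8.  */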
3580 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
 3581 int
 3582 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3583 int *peel_iters_epilogue,
3584 stmt_vector_for_cost *scalar_cost_vec,
3585 stmt_vector_for_cost *prologue_cost_vec,
3586 stmt_vector_for_cost *epilogue_cost_vec)
3588 int retval = 0;
3590 *peel_iters_epilogue
3591 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3593 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
 3595       /* If peeled iterations are known but the number of scalar loop
 3596 	 iterations is unknown, count a taken branch per peeled loop.  */
3597 if (peel_iters_prologue > 0)
3598 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3599 NULL, NULL_TREE, 0, vect_prologue);
3600 if (*peel_iters_epilogue > 0)
3601 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3602 NULL, NULL_TREE, 0, vect_epilogue);
3605 stmt_info_for_cost *si;
3606 int j;
3607 if (peel_iters_prologue)
3608 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3609 retval += record_stmt_cost (prologue_cost_vec,
3610 si->count * peel_iters_prologue,
3611 si->kind, si->stmt_info, si->misalign,
3612 vect_prologue);
3613 if (*peel_iters_epilogue)
3614 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3615 retval += record_stmt_cost (epilogue_cost_vec,
3616 si->count * *peel_iters_epilogue,
3617 si->kind, si->stmt_info, si->misalign,
3618 vect_epilogue);
3620 return retval;
3623 /* Function vect_estimate_min_profitable_iters
3625 Return the number of iterations required for the vector version of the
3626 loop to be profitable relative to the cost of the scalar version of the
3627 loop.
3629 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3630 of iterations for vectorization. -1 value means loop vectorization
3631 is not profitable. This returned value may be used for dynamic
3632 profitability check.
3634 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3635 for static check against estimated number of iterations. */
3637 static void
3638 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3639 int *ret_min_profitable_niters,
3640 int *ret_min_profitable_estimate)
3642 int min_profitable_iters;
3643 int min_profitable_estimate;
3644 int peel_iters_prologue;
3645 int peel_iters_epilogue;
3646 unsigned vec_inside_cost = 0;
3647 int vec_outside_cost = 0;
3648 unsigned vec_prologue_cost = 0;
3649 unsigned vec_epilogue_cost = 0;
3650 int scalar_single_iter_cost = 0;
3651 int scalar_outside_cost = 0;
3652 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3653 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3654 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3656 /* Cost model disabled. */
3657 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3659 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3661 *ret_min_profitable_niters = 0;
3662 *ret_min_profitable_estimate = 0;
3663 return;
3666 /* Requires loop versioning tests to handle misalignment. */
3667 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3669 /* FIXME: Make cost depend on complexity of individual check. */
3670 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3671 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3672 NULL, NULL_TREE, 0, vect_prologue);
3673 if (dump_enabled_p ())
3674 dump_printf (MSG_NOTE,
3675 "cost model: Adding cost of checks for loop "
3676 "versioning to treat misalignment.\n");
3679 /* Requires loop versioning with alias checks. */
3680 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3682 /* FIXME: Make cost depend on complexity of individual check. */
3683 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3684 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3685 NULL, NULL_TREE, 0, vect_prologue);
3686 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3687 if (len)
3688 /* Count LEN - 1 ANDs and LEN comparisons. */
3689 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3690 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3691 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3692 if (len)
3694 /* Count LEN - 1 ANDs and LEN comparisons. */
3695 unsigned int nstmts = len * 2 - 1;
3696 /* +1 for each bias that needs adding. */
3697 for (unsigned int i = 0; i < len; ++i)
3698 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3699 nstmts += 1;
3700 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3701 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3703 if (dump_enabled_p ())
3704 dump_printf (MSG_NOTE,
3705 "cost model: Adding cost of checks for loop "
3706 "versioning aliasing.\n");
3709 /* Requires loop versioning with niter checks. */
3710 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3712 /* FIXME: Make cost depend on complexity of individual check. */
3713 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3714 NULL, NULL_TREE, 0, vect_prologue);
3715 if (dump_enabled_p ())
3716 dump_printf (MSG_NOTE,
3717 "cost model: Adding cost of checks for loop "
3718 "versioning niters.\n");
3721 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3722 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3723 NULL, NULL_TREE, 0, vect_prologue);
3725 /* Count statements in scalar loop. Using this as scalar cost for a single
3726 iteration for now.
3728 TODO: Add outer loop support.
3730 TODO: Consider assigning different costs to different scalar
3731 statements. */
3733 scalar_single_iter_cost
3734 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3736 /* Add additional cost for the peeled instructions in prologue and epilogue
3737 loop. (For fully-masked loops there will be no peeling.)
3739 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
 3740      at compile time, we assume it's vf/2 (the worst would be vf-1).
3742 TODO: Build an expression that represents peel_iters for prologue and
3743 epilogue to be used in a run-time test. */
3745 bool prologue_need_br_taken_cost = false;
3746 bool prologue_need_br_not_taken_cost = false;
3748 /* Calculate peel_iters_prologue. */
3749 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3750 peel_iters_prologue = 0;
3751 else if (npeel < 0)
3753 peel_iters_prologue = assumed_vf / 2;
3754 if (dump_enabled_p ())
3755 dump_printf (MSG_NOTE, "cost model: "
3756 "prologue peel iters set to vf/2.\n");
3758 /* If peeled iterations are unknown, count a taken branch and a not taken
3759 branch per peeled loop. Even if scalar loop iterations are known,
3760 vector iterations are not known since peeled prologue iterations are
3761 not known. Hence guards remain the same. */
3762 prologue_need_br_taken_cost = true;
3763 prologue_need_br_not_taken_cost = true;
3765 else
3767 peel_iters_prologue = npeel;
3768 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
 3769 	/* If peeled iterations are known but the number of scalar loop
 3770 	   iterations is unknown, count a taken branch per peeled loop.  */
3771 prologue_need_br_taken_cost = true;
3774 bool epilogue_need_br_taken_cost = false;
3775 bool epilogue_need_br_not_taken_cost = false;
3777 /* Calculate peel_iters_epilogue. */
3778 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3779 /* We need to peel exactly one iteration for gaps. */
3780 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3781 else if (npeel < 0)
3783 /* If peeling for alignment is unknown, loop bound of main loop
3784 becomes unknown. */
3785 peel_iters_epilogue = assumed_vf / 2;
3786 if (dump_enabled_p ())
3787 dump_printf (MSG_NOTE, "cost model: "
3788 "epilogue peel iters set to vf/2 because "
3789 "peeling for alignment is unknown.\n");
3791 /* See the same reason above in peel_iters_prologue calculation. */
3792 epilogue_need_br_taken_cost = true;
3793 epilogue_need_br_not_taken_cost = true;
3795 else
3797 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3798 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
 3799 	/* If peeled iterations are known but the number of scalar loop
 3800 	   iterations is unknown, count a taken branch per peeled loop.  */
3801 epilogue_need_br_taken_cost = true;
3804 stmt_info_for_cost *si;
3805 int j;
3806 /* Add costs associated with peel_iters_prologue. */
3807 if (peel_iters_prologue)
3808 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3810 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3811 si->count * peel_iters_prologue, si->kind,
3812 si->stmt_info, si->vectype, si->misalign,
3813 vect_prologue);
3816 /* Add costs associated with peel_iters_epilogue. */
3817 if (peel_iters_epilogue)
3818 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3820 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3821 si->count * peel_iters_epilogue, si->kind,
3822 si->stmt_info, si->vectype, si->misalign,
3823 vect_epilogue);
3826 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3828 if (prologue_need_br_taken_cost)
3829 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3830 NULL, NULL_TREE, 0, vect_prologue);
3832 if (prologue_need_br_not_taken_cost)
3833 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3834 cond_branch_not_taken, NULL, NULL_TREE, 0,
3835 vect_prologue);
3837 if (epilogue_need_br_taken_cost)
3838 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3839 NULL, NULL_TREE, 0, vect_epilogue);
3841 if (epilogue_need_br_not_taken_cost)
3842 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3843 cond_branch_not_taken, NULL, NULL_TREE, 0,
3844 vect_epilogue);
3846 /* Take care of special costs for rgroup controls of partial vectors. */
3847 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3849 /* Calculate how many masks we need to generate. */
3850 unsigned int num_masks = 0;
3851 rgroup_controls *rgm;
3852 unsigned int num_vectors_m1;
3853 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3854 if (rgm->type)
3855 num_masks += num_vectors_m1 + 1;
3856 gcc_assert (num_masks > 0);
3858 /* In the worst case, we need to generate each mask in the prologue
3859 and in the loop body. One of the loop body mask instructions
3860 replaces the comparison in the scalar loop, and since we don't
3861 count the scalar comparison against the scalar body, we shouldn't
3862 count that vector instruction against the vector body either.
3864 Sometimes we can use unpacks instead of generating prologue
3865 masks and sometimes the prologue mask will fold to a constant,
3866 so the actual prologue cost might be smaller. However, it's
3867 simpler and safer to use the worst-case cost; if this ends up
3868 being the tie-breaker between vectorizing or not, then it's
3869 probably better not to vectorize. */
3870 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
3871 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
3872 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
3873 vector_stmt, NULL, NULL_TREE, 0, vect_body);
3875 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
3877 /* Referring to the functions vect_set_loop_condition_partial_vectors
3878 and vect_set_loop_controls_directly, we need to generate each
3879 length in the prologue and in the loop body if required. Although
3880 there are some possible optimizations, we consider the worst case
3881 here. */
3883 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
3884 bool need_iterate_p
3885 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3886 && !vect_known_niters_smaller_than_vf (loop_vinfo));
3888 /* Calculate how many statements to be added. */
3889 unsigned int prologue_stmts = 0;
3890 unsigned int body_stmts = 0;
3892 rgroup_controls *rgc;
3893 unsigned int num_vectors_m1;
3894 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
3895 if (rgc->type)
3897 /* May need one SHIFT for nitems_total computation. */
3898 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
3899 if (nitems != 1 && !niters_known_p)
3900 prologue_stmts += 1;
3902 /* May need one MAX and one MINUS for wrap around. */
3903 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
3904 prologue_stmts += 2;
 3906 	    /* Need one MAX and one MINUS for each batch limit except for
 3907 	       the first one.  */
3908 prologue_stmts += num_vectors_m1 * 2;
3910 unsigned int num_vectors = num_vectors_m1 + 1;
3912 /* Need to set up lengths in prologue, only one MIN required
3913 for each since start index is zero. */
3914 prologue_stmts += num_vectors;
3916 /* Each may need two MINs and one MINUS to update lengths in body
3917 for next iteration. */
3918 if (need_iterate_p)
3919 body_stmts += 3 * num_vectors;
3922 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
3923 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3924 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
3925 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
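/* For illustration (hypothetical rgroup): a single length-based rgroup with
   two vectors (num_vectors_m1 == 1), nitems > 1, an unknown iteration
   count, a possibly wrapping IV and need_iterate_p true accumulates
     prologue_stmts = 1 + 2 + 1 * 2 + 2 = 7
     body_stmts     = 3 * 2             = 6
   which the two add_stmt_cost calls above record as scalar_stmt costs.  */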
3928 /* FORNOW: The scalar outside cost is incremented in one of the
3929 following ways:
3931 1. The vectorizer checks for alignment and aliasing and generates
3932 a condition that allows dynamic vectorization. A cost model
3933 check is ANDED with the versioning condition. Hence scalar code
3934 path now has the added cost of the versioning check.
3936 if (cost > th & versioning_check)
3937 jmp to vector code
3939 Hence run-time scalar is incremented by not-taken branch cost.
3941 2. The vectorizer then checks if a prologue is required. If the
3942 cost model check was not done before during versioning, it has to
3943 be done before the prologue check.
3945 if (cost <= th)
3946 prologue = scalar_iters
3947 if (prologue == 0)
3948 jmp to vector code
3949 else
3950 execute prologue
3951 if (prologue == num_iters)
3952 go to exit
3954 Hence the run-time scalar cost is incremented by a taken branch,
3955 plus a not-taken branch, plus a taken branch cost.
3957 3. The vectorizer then checks if an epilogue is required. If the
3958 cost model check was not done before during prologue check, it
3959 has to be done with the epilogue check.
3961 if (prologue == 0)
3962 jmp to vector code
3963 else
3964 execute prologue
3965 if (prologue == num_iters)
3966 go to exit
3967 vector code:
3968 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3969 jmp to epilogue
3971 Hence the run-time scalar cost should be incremented by 2 taken
3972 branches.
 3974      TODO: The back end may reorder the BBs differently and reverse
3975 conditions/branch directions. Change the estimates below to
3976 something more reasonable. */
3978 /* If the number of iterations is known and we do not do versioning, we can
3979 decide whether to vectorize at compile time. Hence the scalar version
 3980      does not carry cost model guard costs.  */
3981 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3982 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3984 /* Cost model check occurs at versioning. */
3985 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3986 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3987 else
3989 /* Cost model check occurs at prologue generation. */
3990 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3991 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3992 + vect_get_stmt_cost (cond_branch_not_taken);
3993 /* Cost model check occurs at epilogue generation. */
3994 else
3995 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3999 /* Complete the target-specific cost calculations. */
4000 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4001 &vec_inside_cost, &vec_epilogue_cost);
4003 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4005 /* Stash the costs so that we can compare two loop_vec_infos. */
4006 loop_vinfo->vec_inside_cost = vec_inside_cost;
4007 loop_vinfo->vec_outside_cost = vec_outside_cost;
4009 if (dump_enabled_p ())
4011 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4012 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4013 vec_inside_cost);
4014 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4015 vec_prologue_cost);
4016 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4017 vec_epilogue_cost);
4018 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4019 scalar_single_iter_cost);
4020 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4021 scalar_outside_cost);
4022 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4023 vec_outside_cost);
4024 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4025 peel_iters_prologue);
4026 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4027 peel_iters_epilogue);
4030 /* Calculate number of iterations required to make the vector version
4031 profitable, relative to the loop bodies only. The following condition
4032 must hold true:
4033 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4034 where
4035 SIC = scalar iteration cost, VIC = vector iteration cost,
4036 VOC = vector outside cost, VF = vectorization factor,
4037 NPEEL = prologue iterations + epilogue iterations,
4038 SOC = scalar outside cost for run time cost model check. */
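/* A worked instance of this condition (hypothetical numbers): with SIC = 4,
   VIC = 10, VF = 4, VOC = 30, SOC = 0 and NPEEL = 0 the inequality
     4 * niters > 10 * (niters / 4) + 30
   first holds at niters = 21, so vectorization would only pay off for trip
   counts of at least 21.  */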
4040 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4041 - vec_inside_cost);
4042 if (saving_per_viter <= 0)
4044 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4045 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4046 "vectorization did not happen for a simd loop");
4048 if (dump_enabled_p ())
4049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4050 "cost model: the vector iteration cost = %d "
4051 "divided by the scalar iteration cost = %d "
4052 "is greater or equal to the vectorization factor = %d"
4053 ".\n",
4054 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4055 *ret_min_profitable_niters = -1;
4056 *ret_min_profitable_estimate = -1;
4057 return;
4060 /* ??? The "if" arm is written to handle all cases; see below for what
4061 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4062 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4064 /* Rewriting the condition above in terms of the number of
4065 vector iterations (vniters) rather than the number of
4066 scalar iterations (niters) gives:
4068 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4070 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4072 For integer N, X and Y when X > 0:
4074 N * X > Y <==> N >= (Y /[floor] X) + 1. */
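/* E.g. (hypothetical values) Y = 17, X = 5: N * 5 > 17 exactly when
   N >= 17 / 5 + 1 = 4, which is how min_vec_niters is computed below.  */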
4075 int outside_overhead = (vec_outside_cost
4076 - scalar_single_iter_cost * peel_iters_prologue
4077 - scalar_single_iter_cost * peel_iters_epilogue
4078 - scalar_outside_cost);
4079 /* We're only interested in cases that require at least one
4080 vector iteration. */
4081 int min_vec_niters = 1;
4082 if (outside_overhead > 0)
4083 min_vec_niters = outside_overhead / saving_per_viter + 1;
4085 if (dump_enabled_p ())
4086 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4087 min_vec_niters);
4089 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4091 /* Now that we know the minimum number of vector iterations,
4092 find the minimum niters for which the scalar cost is larger:
4094 SIC * niters > VIC * vniters + VOC - SOC
4096 We know that the minimum niters is no more than
4097 vniters * VF + NPEEL, but it might be (and often is) less
4098 than that if a partial vector iteration is cheaper than the
4099 equivalent scalar code. */
4100 int threshold = (vec_inside_cost * min_vec_niters
4101 + vec_outside_cost
4102 - scalar_outside_cost);
4103 if (threshold <= 0)
4104 min_profitable_iters = 1;
4105 else
4106 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4108 else
4109 /* Convert the number of vector iterations into a number of
4110 scalar iterations. */
4111 min_profitable_iters = (min_vec_niters * assumed_vf
4112 + peel_iters_prologue
4113 + peel_iters_epilogue);
4115 else
4117 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4118 * assumed_vf
4119 - vec_inside_cost * peel_iters_prologue
4120 - vec_inside_cost * peel_iters_epilogue);
4121 if (min_profitable_iters <= 0)
4122 min_profitable_iters = 0;
4123 else
4125 min_profitable_iters /= saving_per_viter;
4127 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4128 <= (((int) vec_inside_cost * min_profitable_iters)
4129 + (((int) vec_outside_cost - scalar_outside_cost)
4130 * assumed_vf)))
4131 min_profitable_iters++;
4135 if (dump_enabled_p ())
4136 dump_printf (MSG_NOTE,
4137 " Calculated minimum iters for profitability: %d\n",
4138 min_profitable_iters);
4140 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4141 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4142 /* We want the vectorized loop to execute at least once. */
4143 min_profitable_iters = assumed_vf + peel_iters_prologue;
4144 else if (min_profitable_iters < peel_iters_prologue)
4145 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4146 vectorized loop executes at least once. */
4147 min_profitable_iters = peel_iters_prologue;
4149 if (dump_enabled_p ())
4150 dump_printf_loc (MSG_NOTE, vect_location,
4151 " Runtime profitability threshold = %d\n",
4152 min_profitable_iters);
4154 *ret_min_profitable_niters = min_profitable_iters;
4156 /* Calculate number of iterations required to make the vector version
4157 profitable, relative to the loop bodies only.
 4159    The non-vectorized variant costs SIC * niters and it must win over the
 4160    vector variant on the expected loop trip count.  The following condition must hold true:
4161 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4163 if (vec_outside_cost <= 0)
4164 min_profitable_estimate = 0;
4165 /* ??? This "else if" arm is written to handle all cases; see below for
4166 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4167 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4169 /* This is a repeat of the code above, but with + SOC rather
4170 than - SOC. */
4171 int outside_overhead = (vec_outside_cost
4172 - scalar_single_iter_cost * peel_iters_prologue
4173 - scalar_single_iter_cost * peel_iters_epilogue
4174 + scalar_outside_cost);
4175 int min_vec_niters = 1;
4176 if (outside_overhead > 0)
4177 min_vec_niters = outside_overhead / saving_per_viter + 1;
4179 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4181 int threshold = (vec_inside_cost * min_vec_niters
4182 + vec_outside_cost
4183 + scalar_outside_cost);
4184 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4186 else
4187 min_profitable_estimate = (min_vec_niters * assumed_vf
4188 + peel_iters_prologue
4189 + peel_iters_epilogue);
4191 else
4193 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4194 * assumed_vf
4195 - vec_inside_cost * peel_iters_prologue
4196 - vec_inside_cost * peel_iters_epilogue)
4197 / ((scalar_single_iter_cost * assumed_vf)
4198 - vec_inside_cost);
4200 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4201 if (dump_enabled_p ())
4202 dump_printf_loc (MSG_NOTE, vect_location,
4203 " Static estimate profitability threshold = %d\n",
4204 min_profitable_estimate);
4206 *ret_min_profitable_estimate = min_profitable_estimate;
4209 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4210 vector elements (not bits) for a vector with NELT elements. */
4211 static void
4212 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4213 vec_perm_builder *sel)
4215 /* The encoding is a single stepped pattern. Any wrap-around is handled
4216 by vec_perm_indices. */
4217 sel->new_vector (nelt, 1, 3);
4218 for (unsigned int i = 0; i < 3; i++)
4219 sel->quick_push (i + offset);
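/* As an example (hypothetical values): for OFFSET = 2 and NELT = 8 the
   three elements pushed above are {2, 3, 4}; the stepped encoding expands
   to the selector {2, 3, 4, 5, 6, 7, 8, 9}, i.e. a whole-vector shift by
   two elements, with the indexes past the first input handled by
   vec_perm_indices as noted above.  */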
4222 /* Checks whether the target supports whole-vector shifts for vectors of mode
4223 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4224 it supports vec_perm_const with masks for all necessary shift amounts. */
4225 static bool
4226 have_whole_vector_shift (machine_mode mode)
4228 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4229 return true;
4231 /* Variable-length vectors should be handled via the optab. */
4232 unsigned int nelt;
4233 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4234 return false;
4236 vec_perm_builder sel;
4237 vec_perm_indices indices;
4238 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4240 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4241 indices.new_vector (sel, 2, nelt);
4242 if (!can_vec_perm_const_p (mode, indices, false))
4243 return false;
4245 return true;
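/* For a fixed-width mode with 8 elements (hypothetical), the loop above
   queries vec_perm_const support for element shifts of 4, 2 and 1, the
   offsets needed by a log2-style shift reduction sequence.  */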
4248 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4249 functions. Design better to avoid maintenance issues. */
4251 /* Function vect_model_reduction_cost.
4253 Models cost for a reduction operation, including the vector ops
4254 generated within the strip-mine loop, the initial definition before
4255 the loop, and the epilogue code that must be generated. */
4257 static void
4258 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4259 stmt_vec_info stmt_info, internal_fn reduc_fn,
4260 vect_reduction_type reduction_type,
4261 int ncopies, stmt_vector_for_cost *cost_vec)
4263 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4264 enum tree_code code;
4265 optab optab;
4266 tree vectype;
4267 machine_mode mode;
4268 class loop *loop = NULL;
4270 if (loop_vinfo)
4271 loop = LOOP_VINFO_LOOP (loop_vinfo);
4273 /* Condition reductions generate two reductions in the loop. */
4274 if (reduction_type == COND_REDUCTION)
4275 ncopies *= 2;
4277 vectype = STMT_VINFO_VECTYPE (stmt_info);
4278 mode = TYPE_MODE (vectype);
4279 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4281 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4283 if (reduction_type == EXTRACT_LAST_REDUCTION)
4284 /* No extra instructions are needed in the prologue. The loop body
4285 operations are costed in vectorizable_condition. */
4286 inside_cost = 0;
4287 else if (reduction_type == FOLD_LEFT_REDUCTION)
4289 /* No extra instructions needed in the prologue. */
4290 prologue_cost = 0;
4292 if (reduc_fn != IFN_LAST)
4293 /* Count one reduction-like operation per vector. */
4294 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4295 stmt_info, 0, vect_body);
4296 else
4298 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4299 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4300 inside_cost = record_stmt_cost (cost_vec, nelements,
4301 vec_to_scalar, stmt_info, 0,
4302 vect_body);
4303 inside_cost += record_stmt_cost (cost_vec, nelements,
4304 scalar_stmt, stmt_info, 0,
4305 vect_body);
4308 else
4310 /* Add in cost for initial definition.
4311 For cond reduction we have four vectors: initial index, step,
4312 initial result of the data reduction, initial value of the index
4313 reduction. */
4314 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4315 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4316 scalar_to_vec, stmt_info, 0,
4317 vect_prologue);
4319 /* Cost of reduction op inside loop. */
4320 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4321 stmt_info, 0, vect_body);
4324 /* Determine cost of epilogue code.
4326 We have a reduction operator that will reduce the vector in one statement.
4327 Also requires scalar extract. */
4329 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4331 if (reduc_fn != IFN_LAST)
4333 if (reduction_type == COND_REDUCTION)
 4335 	  /* An EQ stmt and a COND_EXPR stmt.  */
4336 epilogue_cost += record_stmt_cost (cost_vec, 2,
4337 vector_stmt, stmt_info, 0,
4338 vect_epilogue);
4339 /* Reduction of the max index and a reduction of the found
4340 values. */
4341 epilogue_cost += record_stmt_cost (cost_vec, 2,
4342 vec_to_scalar, stmt_info, 0,
4343 vect_epilogue);
4344 /* A broadcast of the max value. */
4345 epilogue_cost += record_stmt_cost (cost_vec, 1,
4346 scalar_to_vec, stmt_info, 0,
4347 vect_epilogue);
4349 else
4351 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4352 stmt_info, 0, vect_epilogue);
4353 epilogue_cost += record_stmt_cost (cost_vec, 1,
4354 vec_to_scalar, stmt_info, 0,
4355 vect_epilogue);
4358 else if (reduction_type == COND_REDUCTION)
4360 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4361 /* Extraction of scalar elements. */
4362 epilogue_cost += record_stmt_cost (cost_vec,
4363 2 * estimated_nunits,
4364 vec_to_scalar, stmt_info, 0,
4365 vect_epilogue);
4366 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4367 epilogue_cost += record_stmt_cost (cost_vec,
4368 2 * estimated_nunits - 3,
4369 scalar_stmt, stmt_info, 0,
4370 vect_epilogue);
4372 else if (reduction_type == EXTRACT_LAST_REDUCTION
4373 || reduction_type == FOLD_LEFT_REDUCTION)
 4374     /* No extra instructions needed in the epilogue.  */
4376 else
4378 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4379 tree bitsize =
4380 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4381 int element_bitsize = tree_to_uhwi (bitsize);
4382 int nelements = vec_size_in_bits / element_bitsize;
4384 if (code == COND_EXPR)
4385 code = MAX_EXPR;
4387 optab = optab_for_tree_code (code, vectype, optab_default);
4389 /* We have a whole vector shift available. */
4390 if (optab != unknown_optab
4391 && VECTOR_MODE_P (mode)
4392 && optab_handler (optab, mode) != CODE_FOR_nothing
4393 && have_whole_vector_shift (mode))
4395 /* Final reduction via vector shifts and the reduction operator.
4396 Also requires scalar extract. */
4397 epilogue_cost += record_stmt_cost (cost_vec,
4398 exact_log2 (nelements) * 2,
4399 vector_stmt, stmt_info, 0,
4400 vect_epilogue);
4401 epilogue_cost += record_stmt_cost (cost_vec, 1,
4402 vec_to_scalar, stmt_info, 0,
4403 vect_epilogue);
4405 else
4406 /* Use extracts and reduction op for final reduction. For N
4407 elements, we have N extracts and N-1 reduction ops. */
4408 epilogue_cost += record_stmt_cost (cost_vec,
4409 nelements + nelements - 1,
4410 vector_stmt, stmt_info, 0,
4411 vect_epilogue);
4415 if (dump_enabled_p ())
4416 dump_printf (MSG_NOTE,
4417 "vect_model_reduction_cost: inside_cost = %d, "
4418 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4419 prologue_cost, epilogue_cost);
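/* To illustrate the epilogue alternatives costed above (hypothetical
   numbers): for a reduction with nelements = 4, the whole-vector shift
   scheme records exact_log2 (4) * 2 = 4 vector_stmt costs plus one
   vec_to_scalar extract, whereas the extract-based fallback records
   4 + 4 - 1 = 7 vector_stmt costs.  */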
4423 /* Function vect_model_induction_cost.
4425 Models cost for induction operations. */
4427 static void
4428 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4429 stmt_vector_for_cost *cost_vec)
4431 unsigned inside_cost, prologue_cost;
4433 if (PURE_SLP_STMT (stmt_info))
4434 return;
4436 /* loop cost for vec_loop. */
4437 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4438 stmt_info, 0, vect_body);
4440 /* prologue cost for vec_init and vec_step. */
4441 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4442 stmt_info, 0, vect_prologue);
4444 if (dump_enabled_p ())
4445 dump_printf_loc (MSG_NOTE, vect_location,
4446 "vect_model_induction_cost: inside_cost = %d, "
4447 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4452 /* Function get_initial_def_for_reduction
4454 Input:
4455 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4456 INIT_VAL - the initial value of the reduction variable
4458 Output:
4459 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4460 of the reduction (used for adjusting the epilog - see below).
4461 Return a vector variable, initialized according to the operation that
4462 STMT_VINFO performs. This vector will be used as the initial value
4463 of the vector of partial results.
4465 Option1 (adjust in epilog): Initialize the vector as follows:
4466 add/bit or/xor: [0,0,...,0,0]
4467 mult/bit and: [1,1,...,1,1]
4468 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4469 and when necessary (e.g. add/mult case) let the caller know
4470 that it needs to adjust the result by init_val.
4472 Option2: Initialize the vector as follows:
4473 add/bit or/xor: [init_val,0,0,...,0]
4474 mult/bit and: [init_val,1,1,...,1]
4475 min/max/cond_expr: [init_val,init_val,...,init_val]
4476 and no adjustments are needed.
4478 For example, for the following code:
4480 s = init_val;
4481 for (i=0;i<n;i++)
4482 s = s + a[i];
4484 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4485 For a vector of 4 units, we want to return either [0,0,0,init_val],
4486 or [0,0,0,0] and let the caller know that it needs to adjust
4487 the result at the end by 'init_val'.
 4489    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
 4490    is not NULL, because then the initialization vector is simpler (same
 4491    element in all entries); otherwise we use Option2.
4493 A cost model should help decide between these two schemes. */
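/* A concrete instance of the two options (hypothetical values): for the
   summation example above with init_val = 10, Option1 initializes the
   vector of partial results with zeros and asks the caller to add 10 to
   the final reduced value, while Option2 folds 10 into one lane of the
   initial vector and needs no adjustment; both yield init_val plus the
   sum of the array elements.  */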
4495 static tree
4496 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4497 stmt_vec_info stmt_vinfo,
4498 enum tree_code code, tree init_val,
4499 tree *adjustment_def)
4501 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4502 tree scalar_type = TREE_TYPE (init_val);
4503 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4504 tree def_for_init;
4505 tree init_def;
4506 REAL_VALUE_TYPE real_init_val = dconst0;
4507 int int_init_val = 0;
4508 gimple_seq stmts = NULL;
4510 gcc_assert (vectype);
4512 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4513 || SCALAR_FLOAT_TYPE_P (scalar_type));
4515 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4516 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4518 /* ADJUSTMENT_DEF is NULL when called from
4519 vect_create_epilog_for_reduction to vectorize double reduction. */
4520 if (adjustment_def)
4521 *adjustment_def = NULL;
4523 switch (code)
4525 case WIDEN_SUM_EXPR:
4526 case DOT_PROD_EXPR:
4527 case SAD_EXPR:
4528 case PLUS_EXPR:
4529 case MINUS_EXPR:
4530 case BIT_IOR_EXPR:
4531 case BIT_XOR_EXPR:
4532 case MULT_EXPR:
4533 case BIT_AND_EXPR:
4535 if (code == MULT_EXPR)
4537 real_init_val = dconst1;
4538 int_init_val = 1;
4541 if (code == BIT_AND_EXPR)
4542 int_init_val = -1;
4544 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4545 def_for_init = build_real (scalar_type, real_init_val);
4546 else
4547 def_for_init = build_int_cst (scalar_type, int_init_val);
4549 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4551 /* Option1: the first element is '0' or '1' as well. */
4552 if (!operand_equal_p (def_for_init, init_val, 0))
4553 *adjustment_def = init_val;
4554 init_def = gimple_build_vector_from_val (&stmts, vectype,
4555 def_for_init);
4557 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4559 /* Option2 (variable length): the first element is INIT_VAL. */
4560 init_def = gimple_build_vector_from_val (&stmts, vectype,
4561 def_for_init);
4562 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4563 vectype, init_def, init_val);
4565 else
4567 /* Option2: the first element is INIT_VAL. */
4568 tree_vector_builder elts (vectype, 1, 2);
4569 elts.quick_push (init_val);
4570 elts.quick_push (def_for_init);
4571 init_def = gimple_build_vector (&stmts, &elts);
4574 break;
4576 case MIN_EXPR:
4577 case MAX_EXPR:
4578 case COND_EXPR:
4580 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4581 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4583 break;
4585 default:
4586 gcc_unreachable ();
4589 if (stmts)
4590 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4591 return init_def;
4594 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4595 NUMBER_OF_VECTORS is the number of vector defs to create.
4596 If NEUTRAL_OP is nonnull, introducing extra elements of that
4597 value will not change the result. */
4599 static void
4600 get_initial_defs_for_reduction (vec_info *vinfo,
4601 slp_tree slp_node,
4602 vec<tree> *vec_oprnds,
4603 unsigned int number_of_vectors,
4604 bool reduc_chain, tree neutral_op)
4606 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4607 stmt_vec_info stmt_vinfo = stmts[0];
4608 unsigned HOST_WIDE_INT nunits;
4609 unsigned j, number_of_places_left_in_vector;
4610 tree vector_type;
4611 unsigned int group_size = stmts.length ();
4612 unsigned int i;
4613 class loop *loop;
4615 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4617 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4619 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4620 gcc_assert (loop);
4621 edge pe = loop_preheader_edge (loop);
4623 gcc_assert (!reduc_chain || neutral_op);
4625 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4626 created vectors. It is greater than 1 if unrolling is performed.
4628 For example, we have two scalar operands, s1 and s2 (e.g., group of
4629 strided accesses of size two), while NUNITS is four (i.e., four scalars
4630 of this type can be packed in a vector). The output vector will contain
4631 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4632 will be 2).
4634 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4635 vectors containing the operands.
4637 For example, NUNITS is four as before, and the group size is 8
4638 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4639 {s5, s6, s7, s8}. */
4641 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4642 nunits = group_size;
4644 number_of_places_left_in_vector = nunits;
4645 bool constant_p = true;
4646 tree_vector_builder elts (vector_type, nunits, 1);
4647 elts.quick_grow (nunits);
4648 gimple_seq ctor_seq = NULL;
4649 for (j = 0; j < nunits * number_of_vectors; ++j)
4651 tree op;
4652 i = j % group_size;
4653 stmt_vinfo = stmts[i];
 4655       /* Get the def before the loop.  In a reduction chain we have only
 4656 	 one initial value.  Otherwise we have as many as there are PHIs in the group.  */
4657 if (reduc_chain)
4658 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4659 else if (((vec_oprnds->length () + 1) * nunits
4660 - number_of_places_left_in_vector >= group_size)
4661 && neutral_op)
4662 op = neutral_op;
4663 else
4664 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4666 /* Create 'vect_ = {op0,op1,...,opn}'. */
4667 number_of_places_left_in_vector--;
4668 elts[nunits - number_of_places_left_in_vector - 1] = op;
4669 if (!CONSTANT_CLASS_P (op))
4670 constant_p = false;
4672 if (number_of_places_left_in_vector == 0)
4674 tree init;
4675 if (constant_p && !neutral_op
4676 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4677 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4678 /* Build the vector directly from ELTS. */
4679 init = gimple_build_vector (&ctor_seq, &elts);
4680 else if (neutral_op)
4682 /* Build a vector of the neutral value and shift the
4683 other elements into place. */
4684 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4685 neutral_op);
4686 int k = nunits;
4687 while (k > 0 && elts[k - 1] == neutral_op)
4688 k -= 1;
4689 while (k > 0)
4691 k -= 1;
4692 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4693 vector_type, init, elts[k]);
4696 else
4698 /* First time round, duplicate ELTS to fill the
4699 required number of vectors. */
4700 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4701 number_of_vectors, *vec_oprnds);
4702 break;
4704 vec_oprnds->quick_push (init);
4706 number_of_places_left_in_vector = nunits;
4707 elts.new_vector (vector_type, nunits, 1);
4708 elts.quick_grow (nunits);
4709 constant_p = true;
4712 if (ctor_seq != NULL)
4713 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
 4716 /* For a statement STMT_INFO taking part in a reduction operation return
 4717    the stmt_vec_info that the meta information is stored on.  */
4719 stmt_vec_info
4720 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4722 stmt_info = vect_orig_stmt (stmt_info);
4723 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4724 if (!is_a <gphi *> (stmt_info->stmt)
4725 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4726 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4727 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4728 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4730 if (gimple_phi_num_args (phi) == 1)
4731 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4733 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4735 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4736 stmt_vec_info info
4737 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4738 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4739 stmt_info = info;
4741 return stmt_info;
4744 /* Function vect_create_epilog_for_reduction
4746 Create code at the loop-epilog to finalize the result of a reduction
4747 computation.
4749 STMT_INFO is the scalar reduction stmt that is being vectorized.
4750 SLP_NODE is an SLP node containing a group of reduction statements. The
4751 first one in this group is STMT_INFO.
4752 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4753 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4754 (counting from 0)
4756 This function:
4757 1. Completes the reduction def-use cycles.
4758 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4759 by calling the function specified by REDUC_FN if available, or by
4760 other means (whole-vector shifts or a scalar loop).
4761 The function also creates a new phi node at the loop exit to preserve
4762 loop-closed form, as illustrated below.
4764 The flow at the entry to this function:
4766 loop:
4767 vec_def = phi <vec_init, null> # REDUCTION_PHI
4768 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4769 s_loop = scalar_stmt # (scalar) STMT_INFO
4770 loop_exit:
4771 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4772 use <s_out0>
4773 use <s_out0>
4775 The above is transformed by this function into:
4777 loop:
4778 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4779 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4780 s_loop = scalar_stmt # (scalar) STMT_INFO
4781 loop_exit:
4782 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4783 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4784 v_out2 = reduce <v_out1>
4785 s_out3 = extract_field <v_out2, 0>
4786 s_out4 = adjust_result <s_out3>
4787 use <s_out4>
4788 use <s_out4>
4791 static void
4792 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4793 stmt_vec_info stmt_info,
4794 slp_tree slp_node,
4795 slp_instance slp_node_instance)
4797 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4798 gcc_assert (reduc_info->is_reduc_info);
4799 /* For double reductions we need to get at the inner loop reduction
4800 stmt which has the meta info attached. Our stmt_info is that of the
4801 loop-closed PHI of the inner loop which we remember as
4802 def for the reduction PHI generation. */
4803 bool double_reduc = false;
4804 stmt_vec_info rdef_info = stmt_info;
4805 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4807 gcc_assert (!slp_node);
4808 double_reduc = true;
4809 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4810 (stmt_info->stmt, 0));
4811 stmt_info = vect_stmt_to_vectorize (stmt_info);
4813 gphi *reduc_def_stmt
4814 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4815 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4816 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4817 tree vectype;
4818 machine_mode mode;
4819 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4820 basic_block exit_bb;
4821 tree scalar_dest;
4822 tree scalar_type;
4823 gimple *new_phi = NULL, *phi;
4824 gimple_stmt_iterator exit_gsi;
4825 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4826 gimple *epilog_stmt = NULL;
4827 gimple *exit_phi;
4828 tree bitsize;
4829 tree def;
4830 tree orig_name, scalar_result;
4831 imm_use_iterator imm_iter, phi_imm_iter;
4832 use_operand_p use_p, phi_use_p;
4833 gimple *use_stmt;
4834 bool nested_in_vect_loop = false;
4835 auto_vec<gimple *> new_phis;
4836 int j, i;
4837 auto_vec<tree> scalar_results;
4838 unsigned int group_size = 1, k;
4839 auto_vec<gimple *> phis;
4840 bool slp_reduc = false;
4841 bool direct_slp_reduc;
4842 tree new_phi_result;
4843 tree induction_index = NULL_TREE;
4845 if (slp_node)
4846 group_size = SLP_TREE_LANES (slp_node);
4848 if (nested_in_vect_loop_p (loop, stmt_info))
4850 outer_loop = loop;
4851 loop = loop->inner;
4852 nested_in_vect_loop = true;
4853 gcc_assert (!slp_node);
4855 gcc_assert (!nested_in_vect_loop || double_reduc);
4857 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4858 gcc_assert (vectype);
4859 mode = TYPE_MODE (vectype);
4861 tree initial_def = NULL;
4862 tree induc_val = NULL_TREE;
4863 tree adjustment_def = NULL;
4864 if (slp_node)
4866 else
4868 /* Get at the scalar def before the loop, that defines the initial value
4869 of the reduction variable. */
4870 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4871 loop_preheader_edge (loop));
4872 /* Optimize: for induction condition reduction, if we can't use zero
4873 for induc_val, use initial_def. */
4874 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4875 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4876 else if (double_reduc)
4878 else if (nested_in_vect_loop)
4880 else
4881 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4884 unsigned vec_num;
4885 int ncopies;
4886 if (slp_node)
4888 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4889 ncopies = 1;
4891 else
4893 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4894 vec_num = 1;
4895 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4898 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4899 which is updated with the current index of the loop for every match of
4900 the original loop's cond_expr (VEC_STMT). This results in a vector
4901 containing the last time the condition passed for that vector lane.
4902 The first match will be a 1 to allow 0 to be used for non-matching
4903 indexes. If there are no matches at all then the vector will be all
4904 zeroes.
4906 PR92772: This algorithm is broken for architectures that support
4907 masked vectors, but do not provide fold_extract_last. */
4908 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4910 auto_vec<std::pair<tree, bool>, 2> ccompares;
4911 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4912 cond_info = vect_stmt_to_vectorize (cond_info);
4913 while (cond_info != reduc_info)
4915 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4917 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4918 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4919 ccompares.safe_push
4920 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4921 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4923 cond_info
4924 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4925 1 + STMT_VINFO_REDUC_IDX
4926 (cond_info)));
4927 cond_info = vect_stmt_to_vectorize (cond_info);
4929 gcc_assert (ccompares.length () != 0);
4931 tree indx_before_incr, indx_after_incr;
4932 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4933 int scalar_precision
4934 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4935 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4936 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4937 (TYPE_MODE (vectype), cr_index_scalar_type,
4938 TYPE_VECTOR_SUBPARTS (vectype));
4940 /* First we create a simple vector induction variable which starts
4941 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4942 vector size (STEP). */
4944 /* Create a {1,2,3,...} vector. */
4945 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4947 /* Create a vector of the step value. */
4948 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4949 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4951 /* Create an induction variable. */
4952 gimple_stmt_iterator incr_gsi;
4953 bool insert_after;
4954 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4955 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4956 insert_after, &indx_before_incr, &indx_after_incr);
4958 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4959 filled with zeros (VEC_ZERO). */
4961 /* Create a vector of 0s. */
4962 tree zero = build_zero_cst (cr_index_scalar_type);
4963 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4965 /* Create a vector phi node. */
4966 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4967 new_phi = create_phi_node (new_phi_tree, loop->header);
4968 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4969 loop_preheader_edge (loop), UNKNOWN_LOCATION);
 4971       /* Now take the condition from the loop's original cond_exprs
 4972 	 and produce a new cond_expr (INDEX_COND_EXPR) which for
4973 every match uses values from the induction variable
4974 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4975 (NEW_PHI_TREE).
4976 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4977 the new cond_expr (INDEX_COND_EXPR). */
4978 gimple_seq stmts = NULL;
4979 for (int i = ccompares.length () - 1; i != -1; --i)
4981 tree ccompare = ccompares[i].first;
4982 if (ccompares[i].second)
4983 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4984 cr_index_vector_type,
4985 ccompare,
4986 indx_before_incr, new_phi_tree);
4987 else
4988 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4989 cr_index_vector_type,
4990 ccompare,
4991 new_phi_tree, indx_before_incr);
4993 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4995 /* Update the phi with the vec cond. */
4996 induction_index = new_phi_tree;
4997 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4998 loop_latch_edge (loop), UNKNOWN_LOCATION);
5001 /* 2. Create epilog code.
5002 The reduction epilog code operates across the elements of the vector
5003 of partial results computed by the vectorized loop.
5004 The reduction epilog code consists of:
5006 step 1: compute the scalar result in a vector (v_out2)
5007 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5008 step 3: adjust the scalar result (s_out3) if needed.
 5010      Step 1 can be accomplished using one of the following three schemes:
5011 (scheme 1) using reduc_fn, if available.
5012 (scheme 2) using whole-vector shifts, if available.
5013 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5014 combined.
5016 The overall epilog code looks like this:
5018 s_out0 = phi <s_loop> # original EXIT_PHI
5019 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5020 v_out2 = reduce <v_out1> # step 1
5021 s_out3 = extract_field <v_out2, 0> # step 2
5022 s_out4 = adjust_result <s_out3> # step 3
5024 (step 3 is optional, and steps 1 and 2 may be combined).
5025 Lastly, the uses of s_out0 are replaced by s_out4. */
5028 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5029 v_out1 = phi <VECT_DEF>
5030 Store them in NEW_PHIS. */
5031 if (double_reduc)
5032 loop = outer_loop;
5033 exit_bb = single_exit (loop)->dest;
5034 new_phis.create (slp_node ? vec_num : ncopies);
5035 for (unsigned i = 0; i < vec_num; i++)
5037 if (slp_node)
5038 def = vect_get_slp_vect_def (slp_node, i);
5039 else
5040 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5041 for (j = 0; j < ncopies; j++)
5043 tree new_def = copy_ssa_name (def);
5044 phi = create_phi_node (new_def, exit_bb);
5045 if (j == 0)
5046 new_phis.quick_push (phi);
5047 else
5049 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5050 new_phis.quick_push (phi);
5053 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5057 exit_gsi = gsi_after_labels (exit_bb);
5059 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5060 (i.e. when reduc_fn is not available) and in the final adjustment
5061 code (if needed). Also get the original scalar reduction variable as
5062 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5063 represents a reduction pattern), the tree-code and scalar-def are
5064 taken from the original stmt that the pattern-stmt (STMT) replaces.
5065 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5066 are taken from STMT. */
5068 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5069 if (orig_stmt_info != stmt_info)
5071 /* Reduction pattern */
5072 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5073 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5076 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5077 scalar_type = TREE_TYPE (scalar_dest);
5078 scalar_results.create (group_size);
5079 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5080 bitsize = TYPE_SIZE (scalar_type);
5082 /* SLP reduction without reduction chain, e.g.,
5083 # a1 = phi <a2, a0>
5084 # b1 = phi <b2, b0>
5085 a2 = operation (a1)
5086 b2 = operation (b1) */
5087 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5089 /* True if we should implement SLP_REDUC using native reduction operations
5090 instead of scalar operations. */
5091 direct_slp_reduc = (reduc_fn != IFN_LAST
5092 && slp_reduc
5093 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5095 /* In case of reduction chain, e.g.,
5096 # a1 = phi <a3, a0>
5097 a2 = operation (a1)
5098 a3 = operation (a2),
5100 we may end up with more than one vector result. Here we reduce them to
5101 one vector. */
5102 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5104 gimple_seq stmts = NULL;
5105 tree first_vect = PHI_RESULT (new_phis[0]);
5106 first_vect = gimple_convert (&stmts, vectype, first_vect);
5107 for (k = 1; k < new_phis.length (); k++)
5109 gimple *next_phi = new_phis[k];
5110 tree second_vect = PHI_RESULT (next_phi);
5111 second_vect = gimple_convert (&stmts, vectype, second_vect);
5112 first_vect = gimple_build (&stmts, code, vectype,
5113 first_vect, second_vect);
5115 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5117 new_phi_result = first_vect;
5118 new_phis.truncate (0);
5119 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5121 /* Likewise if we couldn't use a single defuse cycle. */
5122 else if (ncopies > 1)
5124 gimple_seq stmts = NULL;
5125 tree first_vect = PHI_RESULT (new_phis[0]);
5126 first_vect = gimple_convert (&stmts, vectype, first_vect);
5127 for (int k = 1; k < ncopies; ++k)
5129 tree second_vect = PHI_RESULT (new_phis[k]);
5130 second_vect = gimple_convert (&stmts, vectype, second_vect);
5131 first_vect = gimple_build (&stmts, code, vectype,
5132 first_vect, second_vect);
5134 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5135 new_phi_result = first_vect;
5136 new_phis.truncate (0);
5137 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5139 else
5140 new_phi_result = PHI_RESULT (new_phis[0]);
5142 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5143 && reduc_fn != IFN_LAST)
5145 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5146 various data values where the condition matched and another vector
5147 (INDUCTION_INDEX) containing all the indexes of those matches. We
5148 need to extract the last matching index (which will be the index with
5149 highest value) and use this to index into the data vector.
5150 For the case where there were no matches, the data vector will contain
5151 all default values and the index vector will be all zeros. */
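/* A hypothetical illustration: with NEW_PHI_RESULT = {d, a, d, b} and
   INDUCTION_INDEX = {0, 2, 0, 4}, lanes 1 and 3 last matched on
   iterations 2 and 4 respectively, so the highest index is 4 and the
   extracted result is b.  Had nothing matched, INDUCTION_INDEX would be
   all zeros and every lane of NEW_PHI_RESULT would hold the default
   value.  */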
5153 /* Get various versions of the type of the vector of indexes. */
5154 tree index_vec_type = TREE_TYPE (induction_index);
5155 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5156 tree index_scalar_type = TREE_TYPE (index_vec_type);
5157 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5159 /* Get an unsigned integer version of the type of the data vector. */
5160 int scalar_precision
5161 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5162 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5163 tree vectype_unsigned = build_vector_type
5164 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5166 /* First we need to create a vector (ZERO_VEC) of zeros and another
5167 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5168 can create using a MAX reduction and then expanding.
5169 In the case where the loop never made any matches, the max index will
5170 be zero. */
5172 /* Vector of {0, 0, 0,...}. */
5173 tree zero_vec = build_zero_cst (vectype);
5175 gimple_seq stmts = NULL;
5176 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5177 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5179 /* Find maximum value from the vector of found indexes. */
5180 tree max_index = make_ssa_name (index_scalar_type);
5181 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5182 1, induction_index);
5183 gimple_call_set_lhs (max_index_stmt, max_index);
5184 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5186 /* Vector of {max_index, max_index, max_index,...}. */
5187 tree max_index_vec = make_ssa_name (index_vec_type);
5188 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5189 max_index);
5190 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5191 max_index_vec_rhs);
5192 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5194 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5195 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5196 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5197 otherwise. Only one value should match, resulting in a vector
5198 (VEC_COND) with one data value and the rest zeros.
5199 In the case where the loop never made any matches, every index will
5200 match, resulting in a vector with all data values (which will all be
5201 the default value). */
5203 /* Compare the max index vector to the vector of found indexes to find
5204 the position of the max value. */
5205 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5206 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5207 induction_index,
5208 max_index_vec);
5209 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5211 /* Use the compare to choose either values from the data vector or
5212 zero. */
5213 tree vec_cond = make_ssa_name (vectype);
5214 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5215 vec_compare, new_phi_result,
5216 zero_vec);
5217 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5219 /* Finally we need to extract the data value from the vector (VEC_COND)
5220 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5221 reduction, but because this doesn't exist, we can use a MAX reduction
5222 instead. The data value might be signed or a float, so we need to cast
5223 it first.
5224 In the case where the loop never made any matches, the data values are
5225 all identical, and so will reduce down correctly. */
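/* Purely as an illustration: if VEC_COND ends up as {0, 0, 42, 0},
   viewing it as unsigned and taking IFN_REDUC_MAX yields 42, which is
   exactly what the nonexistent "OR" reduction would have produced.  The
   unsigned view makes the zero lanes the smallest possible values, so
   MAX returns the bit pattern of the single selected lane (or the common
   default value when nothing matched).  */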
5227 /* Make the matched data values unsigned. */
5228 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5229 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5230 vec_cond);
5231 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5232 VIEW_CONVERT_EXPR,
5233 vec_cond_cast_rhs);
5234 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5236 /* Reduce down to a scalar value. */
5237 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5238 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5239 1, vec_cond_cast);
5240 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5241 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5243 /* Convert the reduced value back to the result type and set as the
5244 result. */
5245 stmts = NULL;
5246 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5247 data_reduc);
5248 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5249 scalar_results.safe_push (new_temp);
5251 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5252 && reduc_fn == IFN_LAST)
5254 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5255 idx_val = induction_index[0];
5256 val = data_reduc[0];
5257 for (i = 1; i < nelts; ++i)
5258 if (induction_index[i] > idx_val)
5259 val = data_reduc[i],
5260 idx_val = induction_index[i];
5261 return val; */
5263 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5264 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5265 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5266 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5267 /* Enforced by vectorizable_reduction, which ensures we have target
5268 support before allowing a conditional reduction on variable-length
5269 vectors. */
5270 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5271 tree idx_val = NULL_TREE, val = NULL_TREE;
5272 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5274 tree old_idx_val = idx_val;
5275 tree old_val = val;
5276 idx_val = make_ssa_name (idx_eltype);
5277 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5278 build3 (BIT_FIELD_REF, idx_eltype,
5279 induction_index,
5280 bitsize_int (el_size),
5281 bitsize_int (off)));
5282 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5283 val = make_ssa_name (data_eltype);
5284 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5285 build3 (BIT_FIELD_REF,
5286 data_eltype,
5287 new_phi_result,
5288 bitsize_int (el_size),
5289 bitsize_int (off)));
5290 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5291 if (off != 0)
5293 tree new_idx_val = idx_val;
5294 if (off != v_size - el_size)
5296 new_idx_val = make_ssa_name (idx_eltype);
5297 epilog_stmt = gimple_build_assign (new_idx_val,
5298 MAX_EXPR, idx_val,
5299 old_idx_val);
5300 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5302 tree new_val = make_ssa_name (data_eltype);
5303 epilog_stmt = gimple_build_assign (new_val,
5304 COND_EXPR,
5305 build2 (GT_EXPR,
5306 boolean_type_node,
5307 idx_val,
5308 old_idx_val),
5309 val, old_val);
5310 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5311 idx_val = new_idx_val;
5312 val = new_val;
5315 /* Convert the reduced value back to the result type and set as the
5316 result. */
5317 gimple_seq stmts = NULL;
5318 val = gimple_convert (&stmts, scalar_type, val);
5319 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5320 scalar_results.safe_push (val);
5323 /* 2.3 Create the reduction code, using one of the three schemes described
5324 above. In SLP we simply need to extract all the elements from the
5325 vector (without reducing them), so we use scalar shifts. */
5326 else if (reduc_fn != IFN_LAST && !slp_reduc)
5328 tree tmp;
5329 tree vec_elem_type;
5331 /* Case 1: Create:
5332 v_out2 = reduc_expr <v_out1> */
5334 if (dump_enabled_p ())
5335 dump_printf_loc (MSG_NOTE, vect_location,
5336 "Reduce using direct vector reduction.\n");
5338 gimple_seq stmts = NULL;
5339 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5340 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5341 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5342 vec_elem_type, new_phi_result);
5343 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5344 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5346 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5347 && induc_val)
5349 /* Earlier we set the initial value to be a vector of induc_val
5350 values. Check the result and if it is induc_val then replace
5351 it with the original initial value, unless induc_val is
5352 the same as initial_def already. */
5353 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5354 induc_val);
5356 tmp = make_ssa_name (new_scalar_dest);
5357 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5358 initial_def, new_temp);
5359 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5360 new_temp = tmp;
5363 scalar_results.safe_push (new_temp);
5365 else if (direct_slp_reduc)
5367 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5368 with the elements for other SLP statements replaced with the
5369 neutral value. We can then do a normal reduction on each vector. */
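/* A fixed-length sketch for intuition only (this path is really used
   for variable-length vectors): with GROUP_SIZE == 2, a PLUS reduction
   and an accumulator vector {a0, b0, a1, b1} whose lanes interleave two
   SLP results, INDEX & 1 is {0, 1, 0, 1}; for i == 0 the select keeps
   {a0, 0, a1, 0} (neutral value elsewhere) and the vector reduction
   gives a0 + a1, while i == 1 gives b0 + b1.  */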
5371 /* Enforced by vectorizable_reduction. */
5372 gcc_assert (new_phis.length () == 1);
5373 gcc_assert (pow2p_hwi (group_size));
5375 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5376 vec<stmt_vec_info> orig_phis
5377 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5378 gimple_seq seq = NULL;
5380 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5381 and the same element size as VECTYPE. */
5382 tree index = build_index_vector (vectype, 0, 1);
5383 tree index_type = TREE_TYPE (index);
5384 tree index_elt_type = TREE_TYPE (index_type);
5385 tree mask_type = truth_type_for (index_type);
5387 /* Create a vector that, for each element, identifies which of
5388 the REDUC_GROUP_SIZE results should use it. */
5389 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5390 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5391 build_vector_from_val (index_type, index_mask));
5393 /* Get a neutral vector value. This is simply a splat of the neutral
5394 scalar value if we have one, otherwise the initial scalar value
5395 is itself a neutral value. */
5396 tree vector_identity = NULL_TREE;
5397 tree neutral_op = NULL_TREE;
5398 if (slp_node)
5400 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5401 neutral_op
5402 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5403 vectype, code, first != NULL);
5405 if (neutral_op)
5406 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5407 neutral_op);
5408 for (unsigned int i = 0; i < group_size; ++i)
5410 /* If there's no universal neutral value, we can use the
5411 initial scalar value from the original PHI. This is used
5412 for MIN and MAX reduction, for example. */
5413 if (!neutral_op)
5415 tree scalar_value
5416 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5417 loop_preheader_edge (loop));
5418 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5419 scalar_value);
5420 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5421 scalar_value);
5424 /* Calculate the equivalent of:
5426 sel[j] = (index[j] == i);
5428 which selects the elements of NEW_PHI_RESULT that should
5429 be included in the result. */
5430 tree compare_val = build_int_cst (index_elt_type, i);
5431 compare_val = build_vector_from_val (index_type, compare_val);
5432 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5433 index, compare_val);
5435 /* Calculate the equivalent of:
5437 vec = sel ? new_phi_result : vector_identity;
5439 VEC is now suitable for a full vector reduction. */
5440 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5441 sel, new_phi_result, vector_identity);
5443 /* Do the reduction and convert it to the appropriate type. */
5444 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5445 TREE_TYPE (vectype), vec);
5446 scalar = gimple_convert (&seq, scalar_type, scalar);
5447 scalar_results.safe_push (scalar);
5449 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5451 else
5453 bool reduce_with_shift;
5454 tree vec_temp;
5456 gcc_assert (slp_reduc || new_phis.length () == 1);
5458 /* See if the target wants to do the final (shift) reduction
5459 in a vector mode of smaller size and first reduce upper/lower
5460 halves against each other. */
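/* For example (hypothetical target): if the partial result is a V8HI
   vector and the split_reduction hook returns the V4HI mode, the loop
   below first combines the low and high V4HI halves with CODE, so the
   shift-based reduction that follows only has to handle four elements
   instead of eight.  */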
5461 enum machine_mode mode1 = mode;
5462 tree stype = TREE_TYPE (vectype);
5463 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5464 unsigned nunits1 = nunits;
5465 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5466 && new_phis.length () == 1)
5468 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5469 /* For SLP reductions we have to make sure lanes match up, but
5470 since we're doing individual element final reduction, reducing
5471 the vector width here is even more important.
5472 ??? We can also separate lanes with permutes; for the common
5473 case of power-of-two group-size, odd/even extracts would work. */
5474 if (slp_reduc && nunits != nunits1)
5476 nunits1 = least_common_multiple (nunits1, group_size);
5477 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5480 if (!slp_reduc
5481 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5482 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5484 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5485 stype, nunits1);
5486 reduce_with_shift = have_whole_vector_shift (mode1);
5487 if (!VECTOR_MODE_P (mode1))
5488 reduce_with_shift = false;
5489 else
5491 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5492 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5493 reduce_with_shift = false;
5496 /* First reduce the vector to the desired vector size on which we
5497 should do the shift reduction, by combining upper and lower halves. */
5498 new_temp = new_phi_result;
5499 while (nunits > nunits1)
5501 nunits /= 2;
5502 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5503 stype, nunits);
5504 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5506 /* The target has to make sure we support lowpart/highpart
5507 extraction, either via direct vector extract or through
5508 integer mode punning. */
5509 tree dst1, dst2;
5510 if (convert_optab_handler (vec_extract_optab,
5511 TYPE_MODE (TREE_TYPE (new_temp)),
5512 TYPE_MODE (vectype1))
5513 != CODE_FOR_nothing)
5515 /* Extract sub-vectors directly once vec_extract becomes
5516 a conversion optab. */
5517 dst1 = make_ssa_name (vectype1);
5518 epilog_stmt
5519 = gimple_build_assign (dst1, BIT_FIELD_REF,
5520 build3 (BIT_FIELD_REF, vectype1,
5521 new_temp, TYPE_SIZE (vectype1),
5522 bitsize_int (0)));
5523 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5524 dst2 = make_ssa_name (vectype1);
5525 epilog_stmt
5526 = gimple_build_assign (dst2, BIT_FIELD_REF,
5527 build3 (BIT_FIELD_REF, vectype1,
5528 new_temp, TYPE_SIZE (vectype1),
5529 bitsize_int (bitsize)));
5530 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5532 else
5534 /* Extract via punning to appropriately sized integer mode
5535 vector. */
5536 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5537 tree etype = build_vector_type (eltype, 2);
5538 gcc_assert (convert_optab_handler (vec_extract_optab,
5539 TYPE_MODE (etype),
5540 TYPE_MODE (eltype))
5541 != CODE_FOR_nothing);
5542 tree tem = make_ssa_name (etype);
5543 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5544 build1 (VIEW_CONVERT_EXPR,
5545 etype, new_temp));
5546 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5547 new_temp = tem;
5548 tem = make_ssa_name (eltype);
5549 epilog_stmt
5550 = gimple_build_assign (tem, BIT_FIELD_REF,
5551 build3 (BIT_FIELD_REF, eltype,
5552 new_temp, TYPE_SIZE (eltype),
5553 bitsize_int (0)));
5554 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5555 dst1 = make_ssa_name (vectype1);
5556 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5557 build1 (VIEW_CONVERT_EXPR,
5558 vectype1, tem));
5559 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5560 tem = make_ssa_name (eltype);
5561 epilog_stmt
5562 = gimple_build_assign (tem, BIT_FIELD_REF,
5563 build3 (BIT_FIELD_REF, eltype,
5564 new_temp, TYPE_SIZE (eltype),
5565 bitsize_int (bitsize)));
5566 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5567 dst2 = make_ssa_name (vectype1);
5568 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5569 build1 (VIEW_CONVERT_EXPR,
5570 vectype1, tem));
5571 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5574 new_temp = make_ssa_name (vectype1);
5575 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5576 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5577 new_phis[0] = epilog_stmt;
5580 if (reduce_with_shift && !slp_reduc)
5582 int element_bitsize = tree_to_uhwi (bitsize);
5583 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5584 for variable-length vectors and also requires direct target support
5585 for loop reductions. */
5586 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5587 int nelements = vec_size_in_bits / element_bitsize;
5588 vec_perm_builder sel;
5589 vec_perm_indices indices;
5591 int elt_offset;
5593 tree zero_vec = build_zero_cst (vectype1);
5594 /* Case 2: Create:
5595 for (offset = nelements/2; offset >= 1; offset/=2)
5597 Create: va' = vec_shift <va, offset>
5598 Create: va = vop <va, va'>
5599 } */
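/* A worked instance with invented values, nelements == 4 and a PLUS
   reduction on va = {a0, a1, a2, a3}; shifted-in lanes come from
   ZERO_VEC:
     offset 2:  va' = {a2, a3, 0, 0}      va = {a0+a2, a1+a3, a2, a3}
     offset 1:  va' = {a1+a3, a2, a3, 0}  va = {a0+a1+a2+a3, ...}
   Only element 0 of the final VA matters; it is extracted below by the
   BIT_FIELD_REF at offset 0.  */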
5601 tree rhs;
5603 if (dump_enabled_p ())
5604 dump_printf_loc (MSG_NOTE, vect_location,
5605 "Reduce using vector shifts\n");
5607 gimple_seq stmts = NULL;
5608 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5609 for (elt_offset = nelements / 2;
5610 elt_offset >= 1;
5611 elt_offset /= 2)
5613 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5614 indices.new_vector (sel, 2, nelements);
5615 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5616 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5617 new_temp, zero_vec, mask);
5618 new_temp = gimple_build (&stmts, code,
5619 vectype1, new_name, new_temp);
5621 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5623 /* 2.4 Extract the final scalar result. Create:
5624 s_out3 = extract_field <v_out2, bitpos> */
5626 if (dump_enabled_p ())
5627 dump_printf_loc (MSG_NOTE, vect_location,
5628 "extract scalar result\n");
5630 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5631 bitsize, bitsize_zero_node);
5632 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5633 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5634 gimple_assign_set_lhs (epilog_stmt, new_temp);
5635 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5636 scalar_results.safe_push (new_temp);
5638 else
5640 /* Case 3: Create:
5641 s = extract_field <v_out2, 0>
5642 for (offset = element_size;
5643 offset < vector_size;
5644 offset += element_size;)
5646 Create: s' = extract_field <v_out2, offset>
5647 Create: s = op <s, s'> // For non SLP cases
5648 } */
5650 if (dump_enabled_p ())
5651 dump_printf_loc (MSG_NOTE, vect_location,
5652 "Reduce using scalar code.\n");
5654 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5655 int element_bitsize = tree_to_uhwi (bitsize);
5656 tree compute_type = TREE_TYPE (vectype);
5657 gimple_seq stmts = NULL;
5658 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5660 int bit_offset;
5661 if (gimple_code (new_phi) == GIMPLE_PHI)
5662 vec_temp = PHI_RESULT (new_phi);
5663 else
5664 vec_temp = gimple_assign_lhs (new_phi);
5665 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5666 vec_temp, bitsize, bitsize_zero_node);
5668 /* In SLP we don't need to apply the reduction operation, so we just
5669 collect the s' values in SCALAR_RESULTS. */
5670 if (slp_reduc)
5671 scalar_results.safe_push (new_temp);
5673 for (bit_offset = element_bitsize;
5674 bit_offset < vec_size_in_bits;
5675 bit_offset += element_bitsize)
5677 tree bitpos = bitsize_int (bit_offset);
5678 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5679 compute_type, vec_temp,
5680 bitsize, bitpos);
5681 if (slp_reduc)
5683 /* In SLP we don't need to apply the reduction operation, so
5684 we just collect the s' values in SCALAR_RESULTS. */
5685 new_temp = new_name;
5686 scalar_results.safe_push (new_name);
5688 else
5689 new_temp = gimple_build (&stmts, code, compute_type,
5690 new_name, new_temp);
5694 /* The only case where we need to reduce scalar results in SLP is
5695 unrolling. If the size of SCALAR_RESULTS is greater than
5696 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5697 REDUC_GROUP_SIZE. */
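/* For instance (illustrative only): with REDUC_GROUP_SIZE == 2 and an
   unrolled SLP reduction that produced SCALAR_RESULTS = {a0, b0, a1, b1},
   the loop below folds a1 into slot 0 and b1 into slot 1, leaving the
   two final results {a0 op a1, b0 op b1}.  */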
5698 if (slp_reduc)
5700 tree res, first_res, new_res;
5702 /* Reduce multiple scalar results in case of SLP unrolling. */
5703 for (j = group_size; scalar_results.iterate (j, &res);
5704 j++)
5706 first_res = scalar_results[j % group_size];
5707 new_res = gimple_build (&stmts, code, compute_type,
5708 first_res, res);
5709 scalar_results[j % group_size] = new_res;
5711 for (k = 0; k < group_size; k++)
5712 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5713 scalar_results[k]);
5715 else
5717 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5718 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5719 scalar_results.safe_push (new_temp);
5722 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5725 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5726 && induc_val)
5728 /* Earlier we set the initial value to be a vector of induc_val
5729 values. Check the result and if it is induc_val then replace
5730 it with the original initial value, unless induc_val is
5731 the same as initial_def already. */
5732 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5733 induc_val);
5735 tree tmp = make_ssa_name (new_scalar_dest);
5736 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5737 initial_def, new_temp);
5738 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5739 scalar_results[0] = tmp;
5743 /* 2.5 Adjust the final result by the initial value of the reduction
5744 variable. (When such adjustment is not needed, then
5745 'adjustment_def' is zero). For example, if code is PLUS we create:
5746 new_temp = loop_exit_def + adjustment_def */
5748 if (adjustment_def)
5750 gcc_assert (!slp_reduc);
5751 gimple_seq stmts = NULL;
5752 if (nested_in_vect_loop)
5754 new_phi = new_phis[0];
5755 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5756 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5757 new_temp = gimple_build (&stmts, code, vectype,
5758 PHI_RESULT (new_phi), adjustment_def);
5760 else
5762 new_temp = scalar_results[0];
5763 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5764 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5765 new_temp = gimple_build (&stmts, code, scalar_type,
5766 new_temp, adjustment_def);
5769 epilog_stmt = gimple_seq_last_stmt (stmts);
5770 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5771 if (nested_in_vect_loop)
5773 if (!double_reduc)
5774 scalar_results.quick_push (new_temp);
5775 else
5776 scalar_results[0] = new_temp;
5778 else
5779 scalar_results[0] = new_temp;
5781 new_phis[0] = epilog_stmt;
5784 if (double_reduc)
5785 loop = loop->inner;
5787 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5788 phis with new adjusted scalar results, i.e., replace use <s_out0>
5789 with use <s_out4>.
5791 Transform:
5792 loop_exit:
5793 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5794 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5795 v_out2 = reduce <v_out1>
5796 s_out3 = extract_field <v_out2, 0>
5797 s_out4 = adjust_result <s_out3>
5798 use <s_out0>
5799 use <s_out0>
5801 into:
5803 loop_exit:
5804 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5805 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5806 v_out2 = reduce <v_out1>
5807 s_out3 = extract_field <v_out2, 0>
5808 s_out4 = adjust_result <s_out3>
5809 use <s_out4>
5810 use <s_out4> */
5813 /* In an SLP reduction chain we reduce the vector results into one vector
5814 if necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5815 LHS of the last stmt in the reduction chain, since we are looking for
5816 the loop exit phi node. */
5817 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5819 stmt_vec_info dest_stmt_info
5820 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5821 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5822 group_size = 1;
5825 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5826 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5827 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5828 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5829 correspond to the first vector stmt, etc.
5830 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5831 if (group_size > new_phis.length ())
5832 gcc_assert (!(group_size % new_phis.length ()));
5834 for (k = 0; k < group_size; k++)
5836 if (slp_reduc)
5838 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5840 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5841 /* SLP statements can't participate in patterns. */
5842 gcc_assert (!orig_stmt_info);
5843 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5846 if (nested_in_vect_loop)
5848 if (double_reduc)
5849 loop = outer_loop;
5850 else
5851 gcc_unreachable ();
5854 phis.create (3);
5855 /* Find the loop-closed-use at the loop exit of the original scalar
5856 result. (The reduction result is expected to have two immediate uses,
5857 one at the latch block, and one at the loop exit). For double
5858 reductions we are looking for exit phis of the outer loop. */
5859 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5861 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5863 if (!is_gimple_debug (USE_STMT (use_p)))
5864 phis.safe_push (USE_STMT (use_p));
5866 else
5868 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5870 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5872 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5874 if (!flow_bb_inside_loop_p (loop,
5875 gimple_bb (USE_STMT (phi_use_p)))
5876 && !is_gimple_debug (USE_STMT (phi_use_p)))
5877 phis.safe_push (USE_STMT (phi_use_p));
5883 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5885 /* Replace the uses: */
5886 orig_name = PHI_RESULT (exit_phi);
5887 scalar_result = scalar_results[k];
5888 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5890 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5891 SET_USE (use_p, scalar_result);
5892 update_stmt (use_stmt);
5896 phis.release ();
5900 /* Return a vector of type VECTYPE that is equal to the vector select
5901 operation "MASK ? VEC : IDENTITY". Insert the select statements
5902 before GSI. */
5904 static tree
5905 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5906 tree vec, tree identity)
5908 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5909 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5910 mask, vec, identity);
5911 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5912 return cond;
5915 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5916 order, starting with LHS. Insert the extraction statements before GSI and
5917 associate the new scalar SSA names with variable SCALAR_DEST.
5918 Return the SSA name for the result. */
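/* As an illustrative sketch: for a hypothetical V4SF VECTOR_RHS
   {x0, x1, x2, x3} and initial LHS acc, the expansion is the strictly
   ordered sequence (((acc CODE x0) CODE x1) CODE x2) CODE x3, which is
   what an in-order (fold-left) reduction requires, e.g. floating-point
   addition without reassociation.  */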
5920 static tree
5921 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5922 tree_code code, tree lhs, tree vector_rhs)
5924 tree vectype = TREE_TYPE (vector_rhs);
5925 tree scalar_type = TREE_TYPE (vectype);
5926 tree bitsize = TYPE_SIZE (scalar_type);
5927 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5928 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5930 for (unsigned HOST_WIDE_INT bit_offset = 0;
5931 bit_offset < vec_size_in_bits;
5932 bit_offset += element_bitsize)
5934 tree bitpos = bitsize_int (bit_offset);
5935 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5936 bitsize, bitpos);
5938 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5939 rhs = make_ssa_name (scalar_dest, stmt);
5940 gimple_assign_set_lhs (stmt, rhs);
5941 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5943 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5944 tree new_name = make_ssa_name (scalar_dest, stmt);
5945 gimple_assign_set_lhs (stmt, new_name);
5946 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5947 lhs = new_name;
5949 return lhs;
5952 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5953 type of the vector input. */
5955 static internal_fn
5956 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5958 internal_fn mask_reduc_fn;
5960 switch (reduc_fn)
5962 case IFN_FOLD_LEFT_PLUS:
5963 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5964 break;
5966 default:
5967 return IFN_LAST;
5970 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5971 OPTIMIZE_FOR_SPEED))
5972 return mask_reduc_fn;
5973 return IFN_LAST;
5976 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5977 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5978 statement. CODE is the operation performed by STMT_INFO and OPS are
5979 its scalar operands. REDUC_INDEX is the index of the operand in
5980 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5981 implements in-order reduction, or IFN_LAST if we should open-code it.
5982 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5983 that should be used to control the operation in a fully-masked loop. */
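/* Sketch of the fully-masked fallback (assumed here: the masked internal
   function is not available): each vector operand DEF0 is first rewritten
   by merge_with_identity as MASK ? DEF0 : {0, ..., 0}, so inactive lanes
   contribute the identity of the PLUS/MINUS operations handled here and
   the unmasked fold-left sequence can then be reused unchanged.  */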
5985 static bool
5986 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5987 stmt_vec_info stmt_info,
5988 gimple_stmt_iterator *gsi,
5989 gimple **vec_stmt, slp_tree slp_node,
5990 gimple *reduc_def_stmt,
5991 tree_code code, internal_fn reduc_fn,
5992 tree ops[3], tree vectype_in,
5993 int reduc_index, vec_loop_masks *masks)
5995 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5996 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5997 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5999 int ncopies;
6000 if (slp_node)
6001 ncopies = 1;
6002 else
6003 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6005 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6006 gcc_assert (ncopies == 1);
6007 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6009 if (slp_node)
6010 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6011 TYPE_VECTOR_SUBPARTS (vectype_in)));
6013 tree op0 = ops[1 - reduc_index];
6015 int group_size = 1;
6016 stmt_vec_info scalar_dest_def_info;
6017 auto_vec<tree> vec_oprnds0;
6018 if (slp_node)
6020 auto_vec<vec<tree> > vec_defs (2);
6021 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6022 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6023 vec_defs[0].release ();
6024 vec_defs[1].release ();
6025 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6026 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6028 else
6030 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6031 op0, &vec_oprnds0);
6032 scalar_dest_def_info = stmt_info;
6035 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6036 tree scalar_type = TREE_TYPE (scalar_dest);
6037 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6039 int vec_num = vec_oprnds0.length ();
6040 gcc_assert (vec_num == 1 || slp_node);
6041 tree vec_elem_type = TREE_TYPE (vectype_out);
6042 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6044 tree vector_identity = NULL_TREE;
6045 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6046 vector_identity = build_zero_cst (vectype_out);
6048 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6049 int i;
6050 tree def0;
6051 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6053 gimple *new_stmt;
6054 tree mask = NULL_TREE;
6055 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6056 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6058 /* Handle MINUS by adding the negative. */
6059 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6061 tree negated = make_ssa_name (vectype_out);
6062 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6063 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6064 def0 = negated;
6067 if (mask && mask_reduc_fn == IFN_LAST)
6068 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6069 vector_identity);
6071 /* On the first iteration the input is simply the scalar phi
6072 result, and for subsequent iterations it is the output of
6073 the preceding operation. */
6074 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6076 if (mask && mask_reduc_fn != IFN_LAST)
6077 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6078 def0, mask);
6079 else
6080 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6081 def0);
6082 /* For chained SLP reductions the output of the previous reduction
6083 operation serves as the input of the next. For the final statement
6084 the output cannot be a temporary - we reuse the original
6085 scalar destination of the last statement. */
6086 if (i != vec_num - 1)
6088 gimple_set_lhs (new_stmt, scalar_dest_var);
6089 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6090 gimple_set_lhs (new_stmt, reduc_var);
6093 else
6095 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6096 reduc_var, def0);
6097 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6098 /* Remove the statement, so that we can use the same code paths
6099 as for statements that we've just created. */
6100 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6101 gsi_remove (&tmp_gsi, true);
6104 if (i == vec_num - 1)
6106 gimple_set_lhs (new_stmt, scalar_dest);
6107 vect_finish_replace_stmt (loop_vinfo,
6108 scalar_dest_def_info,
6109 new_stmt);
6111 else
6112 vect_finish_stmt_generation (loop_vinfo,
6113 scalar_dest_def_info,
6114 new_stmt, gsi);
6116 if (slp_node)
6117 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6118 else
6120 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6121 *vec_stmt = new_stmt;
6125 return true;
6128 /* Function is_nonwrapping_integer_induction.
6130 Check if STMT_VINFO (which is part of loop LOOP) is an induction that
6131 both increments and does not cause overflow. */
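/* A numeric sketch of the check below (values invented): for an unsigned
   short IV with base 0 and step 4 in a loop executing at most 20000
   iterations, the maximum value is 0 + 4 * 20000 = 80000, which needs 17
   bits and so exceeds the 16-bit precision of the type; the induction is
   then rejected as possibly wrapping.  */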
6133 static bool
6134 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6136 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6137 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6138 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6139 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6140 widest_int ni, max_loop_value, lhs_max;
6141 wi::overflow_type overflow = wi::OVF_NONE;
6143 /* Make sure the loop is integer based. */
6144 if (TREE_CODE (base) != INTEGER_CST
6145 || TREE_CODE (step) != INTEGER_CST)
6146 return false;
6148 /* Check that the max size of the loop will not wrap. */
6150 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6151 return true;
6153 if (! max_stmt_executions (loop, &ni))
6154 return false;
6156 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6157 &overflow);
6158 if (overflow)
6159 return false;
6161 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6162 TYPE_SIGN (lhs_type), &overflow);
6163 if (overflow)
6164 return false;
6166 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6167 <= TYPE_PRECISION (lhs_type));
6170 /* Check if masking can be supported by inserting a conditional expression.
6171 CODE is the code for the operation. COND_FN is the conditional internal
6172 function, if it exists. VECTYPE_IN is the type of the vector input. */
6173 static bool
6174 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6175 tree vectype_in)
6177 if (cond_fn != IFN_LAST
6178 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6179 OPTIMIZE_FOR_SPEED))
6180 return false;
6182 switch (code)
6184 case DOT_PROD_EXPR:
6185 case SAD_EXPR:
6186 return true;
6188 default:
6189 return false;
6193 /* Insert a conditional expression to enable masked vectorization. CODE is the
6194 code for the operation. VOP is the array of operands. MASK is the loop
6195 mask. GSI is a statement iterator used to place the new conditional
6196 expression. */
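/* The value selected for the masked-off lanes acts as a per-lane identity:
   for DOT_PROD_EXPR those lanes of op1 become 0, so their products add
   nothing to the accumulator, and for SAD_EXPR they are replaced by the
   corresponding lanes of op0, making the absolute difference for those
   lanes 0.  */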
6197 static void
6198 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6199 gimple_stmt_iterator *gsi)
6201 switch (code)
6203 case DOT_PROD_EXPR:
6205 tree vectype = TREE_TYPE (vop[1]);
6206 tree zero = build_zero_cst (vectype);
6207 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6208 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6209 mask, vop[1], zero);
6210 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6211 vop[1] = masked_op1;
6212 break;
6215 case SAD_EXPR:
6217 tree vectype = TREE_TYPE (vop[1]);
6218 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6219 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6220 mask, vop[1], vop[0]);
6221 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6222 vop[1] = masked_op1;
6223 break;
6226 default:
6227 gcc_unreachable ();
6231 /* Function vectorizable_reduction.
6233 Check if STMT_INFO performs a reduction operation that can be vectorized.
6234 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6235 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6236 Return true if STMT_INFO is vectorizable in this way.
6238 This function also handles reduction idioms (patterns) that have been
6239 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6240 may be of this form:
6241 X = pattern_expr (arg0, arg1, ..., X)
6242 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6243 sequence that had been detected and replaced by the pattern-stmt
6244 (STMT_INFO).
6246 This function also handles reduction of condition expressions, for example:
6247 for (int i = 0; i < N; i++)
6248 if (a[i] < value)
6249 last = a[i];
6250 This is handled by vectorizing the loop and creating an additional vector
6251 containing the loop indexes for which "a[i] < value" was true. In the
6252 function epilogue this is reduced to a single max value and then used to
6253 index into the vector of results.
6255 In some cases of reduction patterns, the type of the reduction variable X is
6256 different than the type of the other arguments of STMT_INFO.
6257 In such cases, the vectype that is used when transforming STMT_INFO into
6258 a vector stmt is different than the vectype that is used to determine the
6259 vectorization factor, because it consists of a different number of elements
6260 than the actual number of elements that are being operated upon in parallel.
6262 For example, consider an accumulation of shorts into an int accumulator.
6263 On some targets it's possible to vectorize this pattern operating on 8
6264 shorts at a time (hence, the vectype for purposes of determining the
6265 vectorization factor should be V8HI); on the other hand, the vectype that
6266 is used to create the vector form is actually V4SI (the type of the result).
6268 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6269 indicates what is the actual level of parallelism (V8HI in the example), so
6270 that the right vectorization factor would be derived. This vectype
6271 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6272 be used to create the vectorized stmt. The right vectype for the vectorized
6273 stmt is obtained from the type of the result X:
6274 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6276 This means that, contrary to "regular" reductions (or "regular" stmts in
6277 general), the following equation:
6278 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6279 does *NOT* necessarily hold for reduction patterns. */
6281 bool
6282 vectorizable_reduction (loop_vec_info loop_vinfo,
6283 stmt_vec_info stmt_info, slp_tree slp_node,
6284 slp_instance slp_node_instance,
6285 stmt_vector_for_cost *cost_vec)
6287 tree scalar_dest;
6288 tree vectype_in = NULL_TREE;
6289 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6290 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6291 stmt_vec_info cond_stmt_vinfo = NULL;
6292 tree scalar_type;
6293 int i;
6294 int ncopies;
6295 bool single_defuse_cycle = false;
6296 bool nested_cycle = false;
6297 bool double_reduc = false;
6298 int vec_num;
6299 tree tem;
6300 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6301 tree cond_reduc_val = NULL_TREE;
6303 /* Make sure it was already recognized as a reduction computation. */
6304 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6305 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6306 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6307 return false;
6309 /* The stmt we store reduction analysis meta on. */
6310 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6311 reduc_info->is_reduc_info = true;
6313 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6315 if (is_a <gphi *> (stmt_info->stmt))
6316 /* Analysis for double-reduction is done on the outer
6317 loop PHI; nested cycles have no further restrictions. */
6318 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6319 else
6320 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6321 return true;
6324 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6325 stmt_vec_info phi_info = stmt_info;
6326 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6327 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6329 if (!is_a <gphi *> (stmt_info->stmt))
6331 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6332 return true;
6334 if (slp_node)
6336 slp_node_instance->reduc_phis = slp_node;
6337 /* ??? We're leaving slp_node to point to the PHIs, we only
6338 need it to get at the number of vector stmts which wasn't
6339 yet initialized for the instance root. */
6341 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6342 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6343 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6345 use_operand_p use_p;
6346 gimple *use_stmt;
6347 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6348 &use_p, &use_stmt);
6349 gcc_assert (res);
6350 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6351 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6355 /* PHIs should not participate in patterns. */
6356 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6357 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6359 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6360 and compute the reduction chain length. Discover the real
6361 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6362 tree reduc_def
6363 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6364 loop_latch_edge
6365 (gimple_bb (reduc_def_phi)->loop_father));
6366 unsigned reduc_chain_length = 0;
6367 bool only_slp_reduc_chain = true;
6368 stmt_info = NULL;
6369 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6370 while (reduc_def != PHI_RESULT (reduc_def_phi))
6372 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6373 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6374 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6376 if (dump_enabled_p ())
6377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6378 "reduction chain broken by patterns.\n");
6379 return false;
6381 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6382 only_slp_reduc_chain = false;
6383 /* ??? For epilogue generation live members of the chain need
6384 to point back to the PHI via their original stmt for
6385 info_for_reduction to work. */
6386 if (STMT_VINFO_LIVE_P (vdef))
6387 STMT_VINFO_REDUC_DEF (def) = phi_info;
6388 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6389 if (!assign)
6391 if (dump_enabled_p ())
6392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6393 "reduction chain includes calls.\n");
6394 return false;
6396 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6398 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6399 TREE_TYPE (gimple_assign_rhs1 (assign))))
6401 if (dump_enabled_p ())
6402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6403 "conversion in the reduction chain.\n");
6404 return false;
6407 else if (!stmt_info)
6408 /* First non-conversion stmt. */
6409 stmt_info = vdef;
6410 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6411 reduc_chain_length++;
6412 if (!stmt_info && slp_node)
6413 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6415 /* PHIs should not participate in patterns. */
6416 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6418 if (nested_in_vect_loop_p (loop, stmt_info))
6420 loop = loop->inner;
6421 nested_cycle = true;
6424 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6425 element. */
6426 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6428 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6429 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6431 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6432 gcc_assert (slp_node
6433 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6435 /* 1. Is vectorizable reduction? */
6436 /* Not supportable if the reduction variable is used in the loop, unless
6437 it's a reduction chain. */
6438 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6439 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6440 return false;
6442 /* Reductions that are not used even in an enclosing outer-loop
6443 are expected to be "live" (used out of the loop). */
6444 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6445 && !STMT_VINFO_LIVE_P (stmt_info))
6446 return false;
6448 /* 2. Has this been recognized as a reduction pattern?
6450 Check if STMT represents a pattern that has been recognized
6451 in earlier analysis stages. For stmts that represent a pattern,
6452 the STMT_VINFO_RELATED_STMT field records the last stmt in
6453 the original sequence that constitutes the pattern. */
6455 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6456 if (orig_stmt_info)
6458 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6459 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6462 /* 3. Check the operands of the operation. The first operands are defined
6463 inside the loop body. The last operand is the reduction variable,
6464 which is defined by the loop-header-phi. */
6466 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6467 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6468 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6469 enum tree_code code = gimple_assign_rhs_code (stmt);
6470 bool lane_reduc_code_p
6471 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6472 int op_type = TREE_CODE_LENGTH (code);
6474 scalar_dest = gimple_assign_lhs (stmt);
6475 scalar_type = TREE_TYPE (scalar_dest);
6476 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6477 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6478 return false;
6480 /* Do not try to vectorize bit-precision reductions. */
6481 if (!type_has_mode_precision_p (scalar_type))
6482 return false;
6484 /* For lane-reducing ops we're reducing the number of reduction PHIs
6485 which means the only use of that may be in the lane-reducing operation. */
6486 if (lane_reduc_code_p
6487 && reduc_chain_length != 1
6488 && !only_slp_reduc_chain)
6490 if (dump_enabled_p ())
6491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6492 "lane-reducing reduction with extra stmts.\n");
6493 return false;
6496 /* All uses but the last are expected to be defined in the loop.
6497 The last use is the reduction variable. In case of nested cycle this
6498 assumption is not true: we use reduc_index to record the index of the
6499 reduction variable. */
6500 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6501 /* We need to skip an extra operand for COND_EXPRs with embedded
6502 comparison. */
6503 unsigned opno_adjust = 0;
6504 if (code == COND_EXPR
6505 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6506 opno_adjust = 1;
6507 for (i = 0; i < op_type; i++)
6509 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6510 if (i == 0 && code == COND_EXPR)
6511 continue;
6513 stmt_vec_info def_stmt_info;
6514 enum vect_def_type dt;
6515 tree op;
6516 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6517 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6518 &def_stmt_info))
6520 if (dump_enabled_p ())
6521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6522 "use not simple.\n");
6523 return false;
6525 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6526 continue;
6528 /* There should be only one cycle def in the stmt, the one
6529 leading to reduc_def. */
6530 if (VECTORIZABLE_CYCLE_DEF (dt))
6531 return false;
6533 /* To properly compute ncopies we are interested in the widest
6534 non-reduction input type in case we're looking at a widening
6535 accumulation that we later handle in vect_transform_reduction. */
6536 if (lane_reduc_code_p
6537 && tem
6538 && (!vectype_in
6539 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6540 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6541 vectype_in = tem;
6543 if (code == COND_EXPR)
6545 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6546 if (dt == vect_constant_def)
6548 cond_reduc_dt = dt;
6549 cond_reduc_val = op;
6551 if (dt == vect_induction_def
6552 && def_stmt_info
6553 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6555 cond_reduc_dt = dt;
6556 cond_stmt_vinfo = def_stmt_info;
6560 if (!vectype_in)
6561 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6562 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6564 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6565 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6566 /* If we have a condition reduction, see if we can simplify it further. */
6567 if (v_reduc_type == COND_REDUCTION)
6569 if (slp_node)
6570 return false;
6572 /* If the condition uses the reduction value, fail. */
6573 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6575 if (dump_enabled_p ())
6576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6577 "condition depends on previous iteration\n");
6578 return false;
6581 if (reduc_chain_length == 1
6582 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6583 vectype_in, OPTIMIZE_FOR_SPEED))
6585 if (dump_enabled_p ())
6586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6587 "optimizing condition reduction with"
6588 " FOLD_EXTRACT_LAST.\n");
6589 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6591 else if (cond_reduc_dt == vect_induction_def)
6593 tree base
6594 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6595 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6597 gcc_assert (TREE_CODE (base) == INTEGER_CST
6598 && TREE_CODE (step) == INTEGER_CST);
6599 cond_reduc_val = NULL_TREE;
6600 enum tree_code cond_reduc_op_code = ERROR_MARK;
6601 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6602 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6604 /* Find a suitable value: below base for MAX_EXPR, above base for
6605 MIN_EXPR; for now punt if base is the minimum value of the type for
6606 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6607 else if (tree_int_cst_sgn (step) == -1)
6609 cond_reduc_op_code = MIN_EXPR;
6610 if (tree_int_cst_sgn (base) == -1)
6611 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6612 else if (tree_int_cst_lt (base,
6613 TYPE_MAX_VALUE (TREE_TYPE (base))))
6614 cond_reduc_val
6615 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6617 else
6619 cond_reduc_op_code = MAX_EXPR;
6620 if (tree_int_cst_sgn (base) == 1)
6621 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6622 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6623 base))
6624 cond_reduc_val
6625 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6627 if (cond_reduc_val)
6629 if (dump_enabled_p ())
6630 dump_printf_loc (MSG_NOTE, vect_location,
6631 "condition expression based on "
6632 "integer induction.\n");
6633 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6634 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6635 = cond_reduc_val;
6636 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6639 else if (cond_reduc_dt == vect_constant_def)
6641 enum vect_def_type cond_initial_dt;
6642 tree cond_initial_val
6643 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6645 gcc_assert (cond_reduc_val != NULL_TREE);
6646 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6647 if (cond_initial_dt == vect_constant_def
6648 && types_compatible_p (TREE_TYPE (cond_initial_val),
6649 TREE_TYPE (cond_reduc_val)))
6651 tree e = fold_binary (LE_EXPR, boolean_type_node,
6652 cond_initial_val, cond_reduc_val);
6653 if (e && (integer_onep (e) || integer_zerop (e)))
6655 if (dump_enabled_p ())
6656 dump_printf_loc (MSG_NOTE, vect_location,
6657 "condition expression based on "
6658 "compile time constant.\n");
6659 /* Record reduction code at analysis stage. */
6660 STMT_VINFO_REDUC_CODE (reduc_info)
6661 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6662 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6668 if (STMT_VINFO_LIVE_P (phi_info))
6669 return false;
6671 if (slp_node)
6672 ncopies = 1;
6673 else
6674 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6676 gcc_assert (ncopies >= 1);
6678 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6680 if (nested_cycle)
6682 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6683 == vect_double_reduction_def);
6684 double_reduc = true;
6687 /* 4.2. Check support for the epilog operation.
6689 If STMT represents a reduction pattern, then the type of the
6690 reduction variable may be different than the type of the rest
6691 of the arguments. For example, consider the case of accumulation
6692 of shorts into an int accumulator. The original code:
6693 S1: int_a = (int) short_a;
6694 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6696 was replaced with:
6697 STMT: int_acc = widen_sum <short_a, int_acc>
6699 This means that:
6700 1. The tree-code that is used to create the vector operation in the
6701 epilog code (that reduces the partial results) is not the
6702 tree-code of STMT, but is rather the tree-code of the original
6703 stmt from the pattern that STMT is replacing. I.e., in the example
6704 above we want to use 'widen_sum' in the loop, but 'plus' in the
6705 epilog.
6706 2. The type (mode) we use to check available target support
6707 for the vector operation to be created in the *epilog*, is
6708 determined by the type of the reduction variable (in the example
6709 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6710 However the type (mode) we use to check available target support
6711 for the vector operation to be created *inside the loop*, is
6712 determined by the type of the other arguments to STMT (in the
6713 example we'd check this: optab_handler (widen_sum_optab,
6714 vect_short_mode)).
6716 This is contrary to "regular" reductions, in which the types of all
6717 the arguments are the same as the type of the reduction variable.
6718 For "regular" reductions we can therefore use the same vector type
6719 (and also the same tree-code) when generating the epilog code and
6720 when generating the code inside the loop. */
6722 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6723 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6725 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6726 if (reduction_type == TREE_CODE_REDUCTION)
6728 /* Check whether it's ok to change the order of the computation.
6729 Generally, when vectorizing a reduction we change the order of the
6730 computation. This may change the behavior of the program in some
6731 cases, so we need to check that this is ok. One exception is when
6732 vectorizing an outer-loop: the inner-loop is executed sequentially,
6733 and therefore vectorizing reductions in the inner-loop during
6734 outer-loop vectorization is safe. */
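/* Illustrative example (not from the original source): a floating-point
   accumulation such as

     double s = 0.0;
     for (i = 0; i < N; i++)
       s += a[i];

   compiled without -ffast-math/-fassociative-math may not be reassociated,
   which is what selects the FOLD_LEFT_REDUCTION path below.  */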
6735 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6737 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6738 is not directly used in stmt. */
6739 if (!only_slp_reduc_chain
6740 && reduc_chain_length != 1)
6742 if (dump_enabled_p ())
6743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6744 "in-order reduction chain without SLP.\n");
6745 return false;
6747 STMT_VINFO_REDUC_TYPE (reduc_info)
6748 = reduction_type = FOLD_LEFT_REDUCTION;
6750 else if (!commutative_tree_code (orig_code)
6751 || !associative_tree_code (orig_code))
6753 if (dump_enabled_p ())
6754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6755 "reduction: not commutative/associative");
6756 return false;
6760 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6761 && ncopies > 1)
6763 if (dump_enabled_p ())
6764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6765 "multiple types in double reduction or condition "
6766 "reduction or fold-left reduction.\n");
6767 return false;
6770 internal_fn reduc_fn = IFN_LAST;
6771 if (reduction_type == TREE_CODE_REDUCTION
6772 || reduction_type == FOLD_LEFT_REDUCTION
6773 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6774 || reduction_type == CONST_COND_REDUCTION)
6776 if (reduction_type == FOLD_LEFT_REDUCTION
6777 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6778 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6780 if (reduc_fn != IFN_LAST
6781 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6782 OPTIMIZE_FOR_SPEED))
6784 if (dump_enabled_p ())
6785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6786 "reduc op not supported by target.\n");
6788 reduc_fn = IFN_LAST;
6791 else
6793 if (!nested_cycle || double_reduc)
6795 if (dump_enabled_p ())
6796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6797 "no reduc code for scalar code.\n");
6799 return false;
6803 else if (reduction_type == COND_REDUCTION)
6805 int scalar_precision
6806 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6807 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6808 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6809 nunits_out);
6811 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6812 OPTIMIZE_FOR_SPEED))
6813 reduc_fn = IFN_REDUC_MAX;
6815 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6817 if (reduction_type != EXTRACT_LAST_REDUCTION
6818 && (!nested_cycle || double_reduc)
6819 && reduc_fn == IFN_LAST
6820 && !nunits_out.is_constant ())
6822 if (dump_enabled_p ())
6823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6824 "missing target support for reduction on"
6825 " variable-length vectors.\n");
6826 return false;
6829 /* For SLP reductions, see if there is a neutral value we can use. */
6830 tree neutral_op = NULL_TREE;
6831 if (slp_node)
6832 neutral_op = neutral_op_for_slp_reduction
6833 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6834 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6836 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6838 /* We can't support in-order reductions of code such as this:
6840 for (int i = 0; i < n1; ++i)
6841 for (int j = 0; j < n2; ++j)
6842 l += a[j];
6844 since GCC effectively transforms the loop when vectorizing:
6846 for (int i = 0; i < n1 / VF; ++i)
6847 for (int j = 0; j < n2; ++j)
6848 for (int k = 0; k < VF; ++k)
6849 l += a[j];
6851 which is a reassociation of the original operation. */
6852 if (dump_enabled_p ())
6853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6854 "in-order double reduction not supported.\n");
6856 return false;
6859 if (reduction_type == FOLD_LEFT_REDUCTION
6860 && slp_node
6861 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6863 /* We cannot use in-order reductions in this case because there is
6864 an implicit reassociation of the operations involved. */
6865 if (dump_enabled_p ())
6866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6867 "in-order unchained SLP reductions not supported.\n");
6868 return false;
6871 /* For double reductions, and for SLP reductions with a neutral value,
6872 we construct a variable-length initial vector by loading a vector
6873 full of the neutral value and then shift-and-inserting the start
6874 values into the low-numbered elements. */
6875 if ((double_reduc || neutral_op)
6876 && !nunits_out.is_constant ()
6877 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6878 vectype_out, OPTIMIZE_FOR_SPEED))
6880 if (dump_enabled_p ())
6881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6882 "reduction on variable-length vectors requires"
6883 " target support for a vector-shift-and-insert"
6884 " operation.\n");
6885 return false;
6888 /* Check extra constraints for variable-length unchained SLP reductions. */
6889 if (STMT_SLP_TYPE (stmt_info)
6890 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6891 && !nunits_out.is_constant ())
6893 /* We checked above that we could build the initial vector when
6894 there's a neutral element value. Check here for the case in
6895 which each SLP statement has its own initial value and in which
6896 that value needs to be repeated for every instance of the
6897 statement within the initial vector. */
6898 unsigned int group_size = SLP_TREE_LANES (slp_node);
6899 if (!neutral_op
6900 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6901 TREE_TYPE (vectype_out)))
6903 if (dump_enabled_p ())
6904 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6905 "unsupported form of SLP reduction for"
6906 " variable-length vectors: cannot build"
6907 " initial vector.\n");
6908 return false;
6910 /* The epilogue code relies on the number of elements being a multiple
6911 of the group size. The duplicate-and-interleave approach to setting
6912 up the initial vector does too. */
6913 if (!multiple_p (nunits_out, group_size))
6915 if (dump_enabled_p ())
6916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6917 "unsupported form of SLP reduction for"
6918 " variable-length vectors: the vector size"
6919 " is not a multiple of the number of results.\n");
6920 return false;
6924 if (reduction_type == COND_REDUCTION)
6926 widest_int ni;
6928 if (! max_loop_iterations (loop, &ni))
6930 if (dump_enabled_p ())
6931 dump_printf_loc (MSG_NOTE, vect_location,
6932 "loop count not known, cannot create cond "
6933 "reduction.\n");
6934 return false;
6936 /* Convert backedges to iterations. */
6937 ni += 1;
6939 /* The additional index will be the same type as the condition. Check
6940 that the loop iteration count fits into this type less one (because
6941 we use up the zero slot for when there are no matches). */
6942 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6943 if (wi::geu_p (ni, wi::to_widest (max_index)))
6945 if (dump_enabled_p ())
6946 dump_printf_loc (MSG_NOTE, vect_location,
6947 "loop size is greater than data size.\n");
6948 return false;
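/* Illustrative example (not from the original source): for a COND_REDUCTION
   over a 1-byte scalar type the index vector also uses 8-bit elements, so
   with the zero slot reserved for "no match" at most 254 iterations can be
   tracked; longer loops are rejected here.  */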
6952 /* In case the vectorization factor (VF) is bigger than the number
6953 of elements that we can fit in a vectype (nunits), we have to generate
6954 more than one vector stmt - i.e - we need to "unroll" the
6955 vector stmt by a factor VF/nunits. For more details see documentation
6956 in vectorizable_operation. */
6958 /* If the reduction is used in an outer loop we need to generate
6959 VF intermediate results, like so (e.g. for ncopies=2):
6960 r0 = phi (init, r0)
6961 r1 = phi (init, r1)
6962 r0 = x0 + r0;
6963 r1 = x1 + r1;
6964 (i.e. we generate VF results in 2 registers).
6965 In this case we have a separate def-use cycle for each copy, and therefore
6966 for each copy we get the vector def for the reduction variable from the
6967 respective phi node created for this copy.
6969 Otherwise (the reduction is unused in the loop nest), we can combine
6970 together intermediate results, like so (e.g. for ncopies=2):
6971 r = phi (init, r)
6972 r = x0 + r;
6973 r = x1 + r;
6974 (i.e. we generate VF/2 results in a single register).
6975 In this case for each copy we get the vector def for the reduction variable
6976 from the vectorized reduction operation generated in the previous iteration.
6978 This only works when we see both the reduction PHI and its only consumer
6979 in vectorizable_reduction and there are no intermediate stmts
6980 participating. */
6981 if (ncopies > 1
6982 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6983 && reduc_chain_length == 1)
6984 single_defuse_cycle = true;
6986 if (single_defuse_cycle || lane_reduc_code_p)
6988 gcc_assert (code != COND_EXPR);
6990 /* 4. Supportable by target? */
6991 bool ok = true;
6993 /* 4.1. check support for the operation in the loop */
6994 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6995 if (!optab)
6997 if (dump_enabled_p ())
6998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6999 "no optab.\n");
7000 ok = false;
7003 machine_mode vec_mode = TYPE_MODE (vectype_in);
7004 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7006 if (dump_enabled_p ())
7007 dump_printf (MSG_NOTE, "op not supported by target.\n");
7008 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7009 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7010 ok = false;
7011 else
7012 if (dump_enabled_p ())
7013 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7016 /* Worthwhile without SIMD support? */
7017 if (ok
7018 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7019 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7021 if (dump_enabled_p ())
7022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7023 "not worthwhile without SIMD support.\n");
7024 ok = false;
7027 /* lane-reducing operations have to go through vect_transform_reduction.
7028 For the other cases try without the single cycle optimization. */
7029 if (!ok)
7031 if (lane_reduc_code_p)
7032 return false;
7033 else
7034 single_defuse_cycle = false;
7037 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7039 /* If the reduction stmt is one of the patterns that have lane
7040 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7041 if ((ncopies > 1 && ! single_defuse_cycle)
7042 && lane_reduc_code_p)
7044 if (dump_enabled_p ())
7045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7046 "multi def-use cycle not possible for lane-reducing "
7047 "reduction operation\n");
7048 return false;
7051 if (slp_node
7052 && !(!single_defuse_cycle
7053 && code != DOT_PROD_EXPR
7054 && code != WIDEN_SUM_EXPR
7055 && code != SAD_EXPR
7056 && reduction_type != FOLD_LEFT_REDUCTION))
7057 for (i = 0; i < op_type; i++)
7058 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7060 if (dump_enabled_p ())
7061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7062 "incompatible vector types for invariants\n");
7063 return false;
7066 if (slp_node)
7067 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7068 else
7069 vec_num = 1;
7071 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7072 reduction_type, ncopies, cost_vec);
7073 if (dump_enabled_p ()
7074 && reduction_type == FOLD_LEFT_REDUCTION)
7075 dump_printf_loc (MSG_NOTE, vect_location,
7076 "using an in-order (fold-left) reduction.\n");
7077 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7078 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7079 reductions go through their own vectorizable_* routines. */
7080 if (!single_defuse_cycle
7081 && code != DOT_PROD_EXPR
7082 && code != WIDEN_SUM_EXPR
7083 && code != SAD_EXPR
7084 && reduction_type != FOLD_LEFT_REDUCTION)
7086 stmt_vec_info tem
7087 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7088 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7090 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7091 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7093 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7094 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7096 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7098 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7099 internal_fn cond_fn = get_conditional_internal_fn (code);
7101 if (reduction_type != FOLD_LEFT_REDUCTION
7102 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7103 && (cond_fn == IFN_LAST
7104 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7105 OPTIMIZE_FOR_SPEED)))
7107 if (dump_enabled_p ())
7108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7109 "can't operate on partial vectors because"
7110 " no conditional operation is available.\n");
7111 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7113 else if (reduction_type == FOLD_LEFT_REDUCTION
7114 && reduc_fn == IFN_LAST
7115 && !expand_vec_cond_expr_p (vectype_in,
7116 truth_type_for (vectype_in),
7117 SSA_NAME))
7119 if (dump_enabled_p ())
7120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7121 "can't operate on partial vectors because"
7122 " no conditional operation is available.\n");
7123 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7125 else
7126 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7127 vectype_in, NULL);
7129 return true;
7132 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7133 value. */
7135 bool
7136 vect_transform_reduction (loop_vec_info loop_vinfo,
7137 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7138 gimple **vec_stmt, slp_tree slp_node)
7140 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7141 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7142 int i;
7143 int ncopies;
7144 int vec_num;
7146 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7147 gcc_assert (reduc_info->is_reduc_info);
7149 if (nested_in_vect_loop_p (loop, stmt_info))
7151 loop = loop->inner;
7152 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7155 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7156 enum tree_code code = gimple_assign_rhs_code (stmt);
7157 int op_type = TREE_CODE_LENGTH (code);
7159 /* Flatten RHS. */
7160 tree ops[3];
7161 switch (get_gimple_rhs_class (code))
7163 case GIMPLE_TERNARY_RHS:
7164 ops[2] = gimple_assign_rhs3 (stmt);
7165 /* Fall thru. */
7166 case GIMPLE_BINARY_RHS:
7167 ops[0] = gimple_assign_rhs1 (stmt);
7168 ops[1] = gimple_assign_rhs2 (stmt);
7169 break;
7170 default:
7171 gcc_unreachable ();
7174 /* All uses but the last are expected to be defined in the loop.
7175 The last use is the reduction variable. In case of nested cycle this
7176 assumption is not true: we use reduc_index to record the index of the
7177 reduction variable. */
7178 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7179 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7180 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7181 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7183 if (slp_node)
7185 ncopies = 1;
7186 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7188 else
7190 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7191 vec_num = 1;
7194 internal_fn cond_fn = get_conditional_internal_fn (code);
7195 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7196 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7198 /* Transform. */
7199 tree new_temp = NULL_TREE;
7200 auto_vec<tree> vec_oprnds0;
7201 auto_vec<tree> vec_oprnds1;
7202 auto_vec<tree> vec_oprnds2;
7203 tree def0;
7205 if (dump_enabled_p ())
7206 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7208 /* FORNOW: Multiple types are not supported for condition. */
7209 if (code == COND_EXPR)
7210 gcc_assert (ncopies == 1);
7212 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7214 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7215 if (reduction_type == FOLD_LEFT_REDUCTION)
7217 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7218 return vectorize_fold_left_reduction
7219 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7220 reduc_fn, ops, vectype_in, reduc_index, masks);
7223 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7224 gcc_assert (single_defuse_cycle
7225 || code == DOT_PROD_EXPR
7226 || code == WIDEN_SUM_EXPR
7227 || code == SAD_EXPR);
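/* Illustrative note (not from the original source): DOT_PROD_EXPR,
   WIDEN_SUM_EXPR and SAD_EXPR are the lane-reducing codes; e.g. a sum of
   absolute differences

     for (i = 0; i < N; i++)
       s += abs (a[i] - b[i]);

   can be pattern-matched to SAD_EXPR.  They combine several narrow input
   lanes into one wider accumulator lane, which is why they are always
   transformed here rather than by the generic vectorizable_* routines.  */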
7229 /* Create the destination vector */
7230 tree scalar_dest = gimple_assign_lhs (stmt);
7231 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7233 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7234 single_defuse_cycle && reduc_index == 0
7235 ? NULL_TREE : ops[0], &vec_oprnds0,
7236 single_defuse_cycle && reduc_index == 1
7237 ? NULL_TREE : ops[1], &vec_oprnds1,
7238 op_type == ternary_op
7239 && !(single_defuse_cycle && reduc_index == 2)
7240 ? ops[2] : NULL_TREE, &vec_oprnds2);
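/* Descriptive note (not from the original source): when a single def-use
   cycle is used, the operand at reduc_index was passed as NULL_TREE above
   and is instead fetched just once below; the defs for the remaining copies
   are pushed onto the corresponding vec_oprnds vector at the end of each
   iteration of the generation loop further down.  */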
7241 if (single_defuse_cycle)
7243 gcc_assert (!slp_node);
7244 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7245 ops[reduc_index],
7246 reduc_index == 0 ? &vec_oprnds0
7247 : (reduc_index == 1 ? &vec_oprnds1
7248 : &vec_oprnds2));
7251 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7253 gimple *new_stmt;
7254 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7255 if (masked_loop_p && !mask_by_cond_expr)
7257 /* Make sure that the reduction accumulator is vop[0]. */
7258 if (reduc_index == 1)
7260 gcc_assert (commutative_tree_code (code));
7261 std::swap (vop[0], vop[1]);
7263 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7264 vectype_in, i);
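/* Descriptive note (not from the original source): this builds e.g.
   IFN_COND_ADD (MASK, ACC, X, ACC) for a PLUS_EXPR reduction, so that
   inactive lanes simply keep the accumulator value.  */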
7265 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7266 vop[0], vop[1], vop[0]);
7267 new_temp = make_ssa_name (vec_dest, call);
7268 gimple_call_set_lhs (call, new_temp);
7269 gimple_call_set_nothrow (call, true);
7270 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7271 new_stmt = call;
7273 else
7275 if (op_type == ternary_op)
7276 vop[2] = vec_oprnds2[i];
7278 if (masked_loop_p && mask_by_cond_expr)
7280 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7281 vectype_in, i);
7282 build_vect_cond_expr (code, vop, mask, gsi);
7285 new_stmt = gimple_build_assign (vec_dest, code,
7286 vop[0], vop[1], vop[2]);
7287 new_temp = make_ssa_name (vec_dest, new_stmt);
7288 gimple_assign_set_lhs (new_stmt, new_temp);
7289 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7292 if (slp_node)
7293 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7294 else if (single_defuse_cycle
7295 && i < ncopies - 1)
7297 if (reduc_index == 0)
7298 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7299 else if (reduc_index == 1)
7300 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7301 else if (reduc_index == 2)
7302 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7304 else
7305 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7308 if (!slp_node)
7309 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7311 return true;
7314 /* Transform phase of a cycle PHI. */
7316 bool
7317 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7318 stmt_vec_info stmt_info, gimple **vec_stmt,
7319 slp_tree slp_node, slp_instance slp_node_instance)
7321 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7322 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7323 int i;
7324 int ncopies;
7325 int j;
7326 bool nested_cycle = false;
7327 int vec_num;
7329 if (nested_in_vect_loop_p (loop, stmt_info))
7331 loop = loop->inner;
7332 nested_cycle = true;
7335 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7336 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7337 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7338 gcc_assert (reduc_info->is_reduc_info);
7340 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7341 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7342 /* Leave the scalar phi in place. */
7343 return true;
7345 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7346 /* For a nested cycle we do not fill the above. */
7347 if (!vectype_in)
7348 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7349 gcc_assert (vectype_in);
7351 if (slp_node)
7353 /* The size vect_schedule_slp_instance computes is off for us. */
7354 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7355 * SLP_TREE_LANES (slp_node), vectype_in);
7356 ncopies = 1;
7358 else
7360 vec_num = 1;
7361 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7364 /* Check whether we should use a single PHI node and accumulate
7365 vectors to one before the backedge. */
7366 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7367 ncopies = 1;
7369 /* Create the destination vector */
7370 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7371 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7372 vectype_out);
7374 /* Get the loop-entry arguments. */
7375 tree vec_initial_def;
7376 auto_vec<tree> vec_initial_defs;
7377 if (slp_node)
7379 vec_initial_defs.reserve (vec_num);
7380 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7381 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7382 tree neutral_op
7383 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7384 STMT_VINFO_REDUC_CODE (reduc_info),
7385 first != NULL);
7386 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7387 &vec_initial_defs, vec_num,
7388 first != NULL, neutral_op);
7390 else
7392 /* Get at the scalar def before the loop that defines the initial
7393 value of the reduction variable. */
7394 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7395 loop_preheader_edge (loop));
7396 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7397 and we can't use zero for induc_val, use initial_def. Similarly
7398 for REDUC_MIN and initial_def larger than the base. */
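/* Illustrative example (not from the original source): for a MAX_EXPR
   integer-induction condition reduction with induc_val 1 and a constant
   initial value of -5, the initial value is already below induc_val, so it
   can be used directly as the vector initial value and the epilogue needs
   no separate "no match" handling.  */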
7399 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7401 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7402 if (TREE_CODE (initial_def) == INTEGER_CST
7403 && !integer_zerop (induc_val)
7404 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7405 && tree_int_cst_lt (initial_def, induc_val))
7406 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7407 && tree_int_cst_lt (induc_val, initial_def))))
7409 induc_val = initial_def;
7410 /* Communicate to epilogue generation that we used
7411 the initial_def. */
7412 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7414 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7415 vec_initial_defs.create (ncopies);
7416 for (i = 0; i < ncopies; ++i)
7417 vec_initial_defs.quick_push (vec_initial_def);
7419 else if (nested_cycle)
7421 /* Do not use an adjustment def as that case is not supported
7422 correctly if ncopies is not one. */
7423 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7424 ncopies, initial_def,
7425 &vec_initial_defs);
7427 else
7429 tree adjustment_def = NULL_TREE;
7430 tree *adjustment_defp = &adjustment_def;
7431 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7432 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7433 adjustment_defp = NULL;
7434 vec_initial_def
7435 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7436 initial_def, adjustment_defp);
7437 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7438 vec_initial_defs.create (ncopies);
7439 for (i = 0; i < ncopies; ++i)
7440 vec_initial_defs.quick_push (vec_initial_def);
7444 /* Generate the reduction PHIs upfront. */
7445 for (i = 0; i < vec_num; i++)
7447 tree vec_init_def = vec_initial_defs[i];
7448 for (j = 0; j < ncopies; j++)
7450 /* Create the reduction-phi that defines the reduction
7451 operand. */
7452 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7454 /* Set the loop-entry arg of the reduction-phi. */
7455 if (j != 0 && nested_cycle)
7456 vec_init_def = vec_initial_defs[j];
7457 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7458 UNKNOWN_LOCATION);
7460 /* The loop-latch arg is set in epilogue processing. */
7462 if (slp_node)
7463 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7464 else
7466 if (j == 0)
7467 *vec_stmt = new_phi;
7468 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7473 return true;
7476 /* Vectorizes LC PHIs. */
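/* Illustrative note (not from the original source): an LC (loop-closed)
   PHI is a single-argument PHI created by loop-closed SSA form on a loop
   exit edge, e.g.

     x_2 = PHI <x_1(exit_edge)>

   and vectorizing it amounts to creating the analogous single-argument
   PHI for each vector def of its argument.  */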
7478 bool
7479 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7480 stmt_vec_info stmt_info, gimple **vec_stmt,
7481 slp_tree slp_node)
7483 if (!loop_vinfo
7484 || !is_a <gphi *> (stmt_info->stmt)
7485 || gimple_phi_num_args (stmt_info->stmt) != 1)
7486 return false;
7488 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7489 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7490 return false;
7492 if (!vec_stmt) /* transformation not required. */
7494 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7495 return true;
7498 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7499 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7500 basic_block bb = gimple_bb (stmt_info->stmt);
7501 edge e = single_pred_edge (bb);
7502 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7503 auto_vec<tree> vec_oprnds;
7504 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7505 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7506 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7507 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7509 /* Create the vectorized LC PHI node. */
7510 gphi *new_phi = create_phi_node (vec_dest, bb);
7511 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7512 if (slp_node)
7513 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7514 else
7515 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7517 if (!slp_node)
7518 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7520 return true;
7524 /* Function vect_min_worthwhile_factor.
7526 For a loop where we could vectorize the operation indicated by CODE,
7527 return the minimum vectorization factor that makes it worthwhile
7528 to use generic vectors. */
7529 static unsigned int
7530 vect_min_worthwhile_factor (enum tree_code code)
7532 switch (code)
7534 case PLUS_EXPR:
7535 case MINUS_EXPR:
7536 case NEGATE_EXPR:
7537 return 4;
7539 case BIT_AND_EXPR:
7540 case BIT_IOR_EXPR:
7541 case BIT_XOR_EXPR:
7542 case BIT_NOT_EXPR:
7543 return 2;
7545 default:
7546 return INT_MAX;
7550 /* Return true if VINFO indicates we are doing loop vectorization and if
7551 it is worth decomposing CODE operations into scalar operations for
7552 that loop's vectorization factor. */
7554 bool
7555 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7557 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7558 unsigned HOST_WIDE_INT value;
7559 return (loop_vinfo
7560 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7561 && value >= vect_min_worthwhile_factor (code));
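/* Illustrative example (not from the original source): with a constant
   vectorization factor of 4 this allows emulating PLUS_EXPR (threshold 4)
   as well as the bitwise codes (threshold 2) in word mode, whereas a factor
   of 2 would only allow the bitwise codes.  */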
7564 /* Function vectorizable_induction
7566 Check if STMT_INFO performs an induction computation that can be vectorized.
7567 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7568 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7569 Return true if STMT_INFO is vectorizable in this way. */
7571 bool
7572 vectorizable_induction (loop_vec_info loop_vinfo,
7573 stmt_vec_info stmt_info,
7574 gimple **vec_stmt, slp_tree slp_node,
7575 stmt_vector_for_cost *cost_vec)
7577 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7578 unsigned ncopies;
7579 bool nested_in_vect_loop = false;
7580 class loop *iv_loop;
7581 tree vec_def;
7582 edge pe = loop_preheader_edge (loop);
7583 basic_block new_bb;
7584 tree new_vec, vec_init, vec_step, t;
7585 tree new_name;
7586 gimple *new_stmt;
7587 gphi *induction_phi;
7588 tree induc_def, vec_dest;
7589 tree init_expr, step_expr;
7590 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7591 unsigned i;
7592 tree expr;
7593 gimple_seq stmts;
7594 gimple_stmt_iterator si;
7596 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7597 if (!phi)
7598 return false;
7600 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7601 return false;
7603 /* Make sure it was recognized as induction computation. */
7604 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7605 return false;
7607 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7608 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7610 if (slp_node)
7611 ncopies = 1;
7612 else
7613 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7614 gcc_assert (ncopies >= 1);
7616 /* FORNOW. These restrictions should be relaxed. */
7617 if (nested_in_vect_loop_p (loop, stmt_info))
7619 imm_use_iterator imm_iter;
7620 use_operand_p use_p;
7621 gimple *exit_phi;
7622 edge latch_e;
7623 tree loop_arg;
7625 if (ncopies > 1)
7627 if (dump_enabled_p ())
7628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7629 "multiple types in nested loop.\n");
7630 return false;
7633 /* FORNOW: outer loop induction with SLP not supported. */
7634 if (STMT_SLP_TYPE (stmt_info))
7635 return false;
7637 exit_phi = NULL;
7638 latch_e = loop_latch_edge (loop->inner);
7639 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7640 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7642 gimple *use_stmt = USE_STMT (use_p);
7643 if (is_gimple_debug (use_stmt))
7644 continue;
7646 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7648 exit_phi = use_stmt;
7649 break;
7652 if (exit_phi)
7654 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7655 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7656 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7658 if (dump_enabled_p ())
7659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7660 "inner-loop induction only used outside "
7661 "of the outer vectorized loop.\n");
7662 return false;
7666 nested_in_vect_loop = true;
7667 iv_loop = loop->inner;
7669 else
7670 iv_loop = loop;
7671 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7673 if (slp_node && !nunits.is_constant ())
7675 /* The current SLP code creates the initial value element-by-element. */
7676 if (dump_enabled_p ())
7677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7678 "SLP induction not supported for variable-length"
7679 " vectors.\n");
7680 return false;
7683 if (!vec_stmt) /* transformation not required. */
7685 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7686 DUMP_VECT_SCOPE ("vectorizable_induction");
7687 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7688 return true;
7691 /* Transform. */
7693 /* Compute a vector variable, initialized with the first VF values of
7694 the induction variable. E.g., for an iv with IV_PHI='X' and
7695 evolution S, for a vector of 4 units, we want to compute:
7696 [X, X + S, X + 2*S, X + 3*S]. */
7698 if (dump_enabled_p ())
7699 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7701 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7702 gcc_assert (step_expr != NULL_TREE);
7703 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7705 pe = loop_preheader_edge (iv_loop);
7706 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7707 loop_preheader_edge (iv_loop));
7709 stmts = NULL;
7710 if (!nested_in_vect_loop)
7712 /* Convert the initial value to the IV update type. */
7713 tree new_type = TREE_TYPE (step_expr);
7714 init_expr = gimple_convert (&stmts, new_type, init_expr);
7716 /* If we are using the loop mask to "peel" for alignment then we need
7717 to adjust the start value here. */
7718 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7719 if (skip_niters != NULL_TREE)
7721 if (FLOAT_TYPE_P (vectype))
7722 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7723 skip_niters);
7724 else
7725 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7726 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7727 skip_niters, step_expr);
7728 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7729 init_expr, skip_step);
7733 if (stmts)
7735 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7736 gcc_assert (!new_bb);
7739 /* Find the first insertion point in the BB. */
7740 basic_block bb = gimple_bb (phi);
7741 si = gsi_after_labels (bb);
7743 /* For SLP induction we have to generate several IVs as for example
7744 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7745 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7746 [VF*S, VF*S, VF*S, VF*S] for all. */
7747 if (slp_node)
7749 /* Enforced above. */
7750 unsigned int const_nunits = nunits.to_constant ();
7752 /* Generate [VF*S, VF*S, ... ]. */
7753 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7755 expr = build_int_cst (integer_type_node, vf);
7756 expr = fold_convert (TREE_TYPE (step_expr), expr);
7758 else
7759 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7760 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7761 expr, step_expr);
7762 if (! CONSTANT_CLASS_P (new_name))
7763 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7764 TREE_TYPE (step_expr), NULL);
7765 new_vec = build_vector_from_val (step_vectype, new_name);
7766 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7767 new_vec, step_vectype, NULL);
7769 /* Now generate the IVs. */
7770 unsigned group_size = SLP_TREE_LANES (slp_node);
7771 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7772 unsigned elts = const_nunits * nvects;
7773 /* Compute the number of distinct IVs we need. First reduce
7774 group_size if it is a multiple of const_nunits so we get
7775 one IV for a group_size of 4 but const_nunits 2. */
7776 unsigned group_sizep = group_size;
7777 if (group_sizep % const_nunits == 0)
7778 group_sizep = group_sizep / const_nunits;
7779 unsigned nivs = least_common_multiple (group_sizep,
7780 const_nunits) / const_nunits;
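/* Worked example (not from the original source): with group_size 3 and
   const_nunits 4, group_sizep stays 3 and nivs = lcm (3, 4) / 4 = 3,
   matching the three distinct vectors in the comment above; with
   group_size 4 and const_nunits 2, group_sizep becomes 2 and
   nivs = lcm (2, 2) / 2 = 1.  */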
7781 gcc_assert (elts % group_size == 0);
7782 tree elt = init_expr;
7783 unsigned ivn;
7784 for (ivn = 0; ivn < nivs; ++ivn)
7786 tree_vector_builder elts (step_vectype, const_nunits, 1);
7787 stmts = NULL;
7788 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7790 if (ivn*const_nunits + eltn >= group_size
7791 && (ivn * const_nunits + eltn) % group_size == 0)
7792 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7793 elt, step_expr);
7794 elts.quick_push (elt);
7796 vec_init = gimple_build_vector (&stmts, &elts);
7797 vec_init = gimple_convert (&stmts, vectype, vec_init);
7798 if (stmts)
7800 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7801 gcc_assert (!new_bb);
7804 /* Create the induction-phi that defines the induction-operand. */
7805 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7806 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7807 induc_def = PHI_RESULT (induction_phi);
7809 /* Create the iv update inside the loop */
7810 gimple_seq stmts = NULL;
7811 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7812 vec_def = gimple_build (&stmts,
7813 PLUS_EXPR, step_vectype, vec_def, vec_step);
7814 vec_def = gimple_convert (&stmts, vectype, vec_def);
7815 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7817 /* Set the arguments of the phi node: */
7818 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7819 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7820 UNKNOWN_LOCATION);
7822 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7824 /* Fill up to the number of vectors we need for the whole group. */
7825 nivs = least_common_multiple (group_size,
7826 const_nunits) / const_nunits;
7827 for (; ivn < nivs; ++ivn)
7828 SLP_TREE_VEC_STMTS (slp_node)
7829 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
7831 /* Re-use IVs when we can. */
7832 if (ivn < nvects)
7834 unsigned vfp
7835 = least_common_multiple (group_size, const_nunits) / group_size;
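/* Worked example (not from the original source): with group_size 3 and
   const_nunits 4, vfp = lcm (3, 4) / 3 = 4, i.e. each reused IV is
   advanced by four scalar steps relative to the IV it is derived from.  */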
7836 /* Generate [VF'*S, VF'*S, ... ]. */
7837 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7839 expr = build_int_cst (integer_type_node, vfp);
7840 expr = fold_convert (TREE_TYPE (step_expr), expr);
7842 else
7843 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7844 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7845 expr, step_expr);
7846 if (! CONSTANT_CLASS_P (new_name))
7847 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7848 TREE_TYPE (step_expr), NULL);
7849 new_vec = build_vector_from_val (step_vectype, new_name);
7850 vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
7851 step_vectype, NULL);
7852 for (; ivn < nvects; ++ivn)
7854 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7855 tree def;
7856 if (gimple_code (iv) == GIMPLE_PHI)
7857 def = gimple_phi_result (iv);
7858 else
7859 def = gimple_assign_lhs (iv);
7860 gimple_seq stmts = NULL;
7861 def = gimple_convert (&stmts, step_vectype, def);
7862 def = gimple_build (&stmts,
7863 PLUS_EXPR, step_vectype, def, vec_step);
7864 def = gimple_convert (&stmts, vectype, def);
7865 if (gimple_code (iv) == GIMPLE_PHI)
7866 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7867 else
7869 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7870 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7872 SLP_TREE_VEC_STMTS (slp_node)
7873 .quick_push (SSA_NAME_DEF_STMT (def));
7877 return true;
7880 /* Create the vector that holds the initial_value of the induction. */
7881 if (nested_in_vect_loop)
7883 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7884 been created during vectorization of previous stmts. We obtain it
7885 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7886 auto_vec<tree> vec_inits;
7887 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7888 init_expr, &vec_inits);
7889 vec_init = vec_inits[0];
7890 /* If the initial value is not of proper type, convert it. */
7891 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7893 new_stmt
7894 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7895 vect_simple_var,
7896 "vec_iv_"),
7897 VIEW_CONVERT_EXPR,
7898 build1 (VIEW_CONVERT_EXPR, vectype,
7899 vec_init));
7900 vec_init = gimple_assign_lhs (new_stmt);
7901 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7902 new_stmt);
7903 gcc_assert (!new_bb);
7906 else
7908 /* iv_loop is the loop to be vectorized. Create:
7909 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7910 stmts = NULL;
7911 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7913 unsigned HOST_WIDE_INT const_nunits;
7914 if (nunits.is_constant (&const_nunits))
7916 tree_vector_builder elts (step_vectype, const_nunits, 1);
7917 elts.quick_push (new_name);
7918 for (i = 1; i < const_nunits; i++)
7920 /* Create: new_name_i = new_name + step_expr */
7921 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7922 new_name, step_expr);
7923 elts.quick_push (new_name);
7925 /* Create a vector from [new_name_0, new_name_1, ...,
7926 new_name_nunits-1] */
7927 vec_init = gimple_build_vector (&stmts, &elts);
7929 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7930 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7931 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7932 new_name, step_expr);
7933 else
7935 /* Build:
7936 [base, base, base, ...]
7937 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7938 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7939 gcc_assert (flag_associative_math);
7940 tree index = build_index_vector (step_vectype, 0, 1);
7941 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7942 new_name);
7943 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7944 step_expr);
7945 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7946 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7947 vec_init, step_vec);
7948 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7949 vec_init, base_vec);
7951 vec_init = gimple_convert (&stmts, vectype, vec_init);
7953 if (stmts)
7955 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7956 gcc_assert (!new_bb);
7961 /* Create the vector that holds the step of the induction. */
7962 if (nested_in_vect_loop)
7963 /* iv_loop is nested in the loop to be vectorized. Generate:
7964 vec_step = [S, S, S, S] */
7965 new_name = step_expr;
7966 else
7968 /* iv_loop is the loop to be vectorized. Generate:
7969 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7970 gimple_seq seq = NULL;
7971 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7973 expr = build_int_cst (integer_type_node, vf);
7974 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7976 else
7977 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7978 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7979 expr, step_expr);
7980 if (seq)
7982 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7983 gcc_assert (!new_bb);
7987 t = unshare_expr (new_name);
7988 gcc_assert (CONSTANT_CLASS_P (new_name)
7989 || TREE_CODE (new_name) == SSA_NAME);
7990 new_vec = build_vector_from_val (step_vectype, t);
7991 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7992 new_vec, step_vectype, NULL);
7995 /* Create the following def-use cycle:
7996 loop prolog:
7997 vec_init = ...
7998 vec_step = ...
7999 loop:
8000 vec_iv = PHI <vec_init, vec_loop>
8002 STMT
8004 vec_loop = vec_iv + vec_step; */
8006 /* Create the induction-phi that defines the induction-operand. */
8007 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8008 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8009 induc_def = PHI_RESULT (induction_phi);
8011 /* Create the iv update inside the loop */
8012 stmts = NULL;
8013 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8014 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8015 vec_def = gimple_convert (&stmts, vectype, vec_def);
8016 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8017 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8019 /* Set the arguments of the phi node: */
8020 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8021 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8022 UNKNOWN_LOCATION);
8024 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8025 *vec_stmt = induction_phi;
8027 /* In case that vectorization factor (VF) is bigger than the number
8028 of elements that we can fit in a vectype (nunits), we have to generate
8029 more than one vector stmt - i.e - we need to "unroll" the
8030 vector stmt by a factor VF/nunits. For more details see documentation
8031 in vectorizable_operation. */
8033 if (ncopies > 1)
8035 gimple_seq seq = NULL;
8036 /* FORNOW. This restriction should be relaxed. */
8037 gcc_assert (!nested_in_vect_loop);
8039 /* Create the vector that holds the step of the induction. */
8040 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8042 expr = build_int_cst (integer_type_node, nunits);
8043 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8045 else
8046 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8047 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8048 expr, step_expr);
8049 if (seq)
8051 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8052 gcc_assert (!new_bb);
8055 t = unshare_expr (new_name);
8056 gcc_assert (CONSTANT_CLASS_P (new_name)
8057 || TREE_CODE (new_name) == SSA_NAME);
8058 new_vec = build_vector_from_val (step_vectype, t);
8059 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8060 new_vec, step_vectype, NULL);
8062 vec_def = induc_def;
8063 for (i = 1; i < ncopies; i++)
8065 /* vec_i = vec_prev + vec_step */
8066 gimple_seq stmts = NULL;
8067 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8068 vec_def = gimple_build (&stmts,
8069 PLUS_EXPR, step_vectype, vec_def, vec_step);
8070 vec_def = gimple_convert (&stmts, vectype, vec_def);
8072 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8073 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8074 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8078 if (dump_enabled_p ())
8079 dump_printf_loc (MSG_NOTE, vect_location,
8080 "transform induction: created def-use cycle: %G%G",
8081 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8083 return true;
8086 /* Function vectorizable_live_operation.
8088 STMT_INFO computes a value that is used outside the loop. Check if
8089 it can be supported. */
8091 bool
8092 vectorizable_live_operation (vec_info *vinfo,
8093 stmt_vec_info stmt_info,
8094 gimple_stmt_iterator *gsi,
8095 slp_tree slp_node, slp_instance slp_node_instance,
8096 int slp_index, bool vec_stmt_p,
8097 stmt_vector_for_cost *cost_vec)
8099 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8100 imm_use_iterator imm_iter;
8101 tree lhs, lhs_type, bitsize, vec_bitsize;
8102 tree vectype = (slp_node
8103 ? SLP_TREE_VECTYPE (slp_node)
8104 : STMT_VINFO_VECTYPE (stmt_info));
8105 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8106 int ncopies;
8107 gimple *use_stmt;
8108 auto_vec<tree> vec_oprnds;
8109 int vec_entry = 0;
8110 poly_uint64 vec_index = 0;
8112 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8114 /* If a stmt of a reduction is live, vectorize it via
8115 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8116 validity so just trigger the transform here. */
8117 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8119 if (!vec_stmt_p)
8120 return true;
8121 if (slp_node)
8123 /* For reduction chains the meta-info is attached to
8124 the group leader. */
8125 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8126 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8127 /* For SLP reductions we vectorize the epilogue for
8128 all involved stmts together. */
8129 else if (slp_index != 0)
8130 return true;
8131 else
8132 /* For SLP reductions the meta-info is attached to
8133 the representative. */
8134 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8136 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8137 gcc_assert (reduc_info->is_reduc_info);
8138 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8139 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8140 return true;
8141 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8142 slp_node_instance);
8143 return true;
8146 /* If STMT is not relevant and it is a simple assignment and its inputs are
8147 invariant then it can remain in place, unvectorized. The original last
8148 scalar value that it computes will be used. */
8149 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8151 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8152 if (dump_enabled_p ())
8153 dump_printf_loc (MSG_NOTE, vect_location,
8154 "statement is simple and uses invariant. Leaving in "
8155 "place.\n");
8156 return true;
8159 if (slp_node)
8160 ncopies = 1;
8161 else
8162 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8164 if (slp_node)
8166 gcc_assert (slp_index >= 0);
8168 /* Get the last occurrence of the scalar index from the concatenation of
8169 all the slp vectors. Calculate which slp vector it is and the index
8170 within. */
8171 int num_scalar = SLP_TREE_LANES (slp_node);
8172 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8173 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8175 /* Calculate which vector contains the result, and which lane of
8176 that vector we need. */
8177 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8179 if (dump_enabled_p ())
8180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8181 "Cannot determine which vector holds the"
8182 " final result.\n");
8183 return false;
8187 if (!vec_stmt_p)
8189 /* No transformation required. */
8190 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8192 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8193 OPTIMIZE_FOR_SPEED))
8195 if (dump_enabled_p ())
8196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8197 "can't operate on partial vectors "
8198 "because the target doesn't support extract "
8199 "last reduction.\n");
8200 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8202 else if (slp_node)
8204 if (dump_enabled_p ())
8205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8206 "can't operate on partial vectors "
8207 "because an SLP statement is live after "
8208 "the loop.\n");
8209 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8211 else if (ncopies > 1)
8213 if (dump_enabled_p ())
8214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8215 "can't operate on partial vectors "
8216 "because ncopies is greater than 1.\n");
8217 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8219 else
8221 gcc_assert (ncopies == 1 && !slp_node);
8222 vect_record_loop_mask (loop_vinfo,
8223 &LOOP_VINFO_MASKS (loop_vinfo),
8224 1, vectype, NULL);
8227 /* ??? Enable for loop costing as well. */
8228 if (!loop_vinfo)
8229 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8230 0, vect_epilogue);
8231 return true;
8234 /* Use the lhs of the original scalar statement. */
8235 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8236 if (dump_enabled_p ())
8237 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8238 "stmt %G", stmt);
8240 lhs = gimple_get_lhs (stmt);
8241 lhs_type = TREE_TYPE (lhs);
8243 bitsize = vector_element_bits_tree (vectype);
8244 vec_bitsize = TYPE_SIZE (vectype);
8246 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8247 tree vec_lhs, bitstart;
8248 gimple *vec_stmt;
8249 if (slp_node)
8251 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8253 /* Get the correct slp vectorized stmt. */
8254 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8255 vec_lhs = gimple_get_lhs (vec_stmt);
8257 /* Get entry to use. */
8258 bitstart = bitsize_int (vec_index);
8259 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8261 else
8263 /* For multiple copies, get the last copy. */
8264 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8265 vec_lhs = gimple_get_lhs (vec_stmt);
8267 /* Get the last lane in the vector. */
8268 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
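/* Worked example (not from the original source): for a V4SI vector,
   bitsize is 32 and vec_bitsize is 128, so bitstart = 96 selects the
   last element.  */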
8271 if (loop_vinfo)
8273 /* Ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
8274 PHI requirement by inserting one PHI node for it. It looks like:
8275 loop;
8277 # lhs' = PHI <lhs>
8278 ==>
8279 loop;
8281 # vec_lhs' = PHI <vec_lhs>
8282 new_tree = lane_extract <vec_lhs', ...>;
8283 lhs' = new_tree; */
8285 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8286 basic_block exit_bb = single_exit (loop)->dest;
8287 gcc_assert (single_pred_p (exit_bb));
8289 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8290 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8291 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8293 gimple_seq stmts = NULL;
8294 tree new_tree;
8295 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8297 /* Emit:
8299 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8301 where VEC_LHS is the vectorized live-out result and MASK is
8302 the loop mask for the final iteration. */
8303 gcc_assert (ncopies == 1 && !slp_node);
8304 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8305 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8306 1, vectype, 0);
8307 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8308 mask, vec_lhs_phi);
8310 /* Convert the extracted vector element to the scalar type. */
8311 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8313 else
8315 tree bftype = TREE_TYPE (vectype);
8316 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8317 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8318 new_tree = build3 (BIT_FIELD_REF, bftype,
8319 vec_lhs_phi, bitsize, bitstart);
8320 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8321 &stmts, true, NULL_TREE);
8324 if (stmts)
8326 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8327 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8329 /* Remove the existing exit PHI for lhs and replace it with a copy assignment from new_tree. */
8330 tree lhs_phi = NULL_TREE;
8331 gimple_stmt_iterator gsi;
8332 for (gsi = gsi_start_phis (exit_bb);
8333 !gsi_end_p (gsi); gsi_next (&gsi))
8335 gimple *phi = gsi_stmt (gsi);
8336 if ((gimple_phi_arg_def (phi, 0) == lhs))
8338 remove_phi_node (&gsi, false);
8339 lhs_phi = gimple_phi_result (phi);
8340 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8341 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8342 break;
8347 /* Replace uses of lhs with the newly computed result. If the use stmt is
8348 a single-argument PHI, just replace all uses of the PHI result. This is
8349 necessary because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
8350 use_operand_p use_p;
8351 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8352 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8353 && !is_gimple_debug (use_stmt))
8355 if (gimple_code (use_stmt) == GIMPLE_PHI
8356 && gimple_phi_num_args (use_stmt) == 1)
8358 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8360 else
8362 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8363 SET_USE (use_p, new_tree);
8365 update_stmt (use_stmt);
8368 else
8370 /* For basic-block vectorization simply insert the lane-extraction. */
8371 tree bftype = TREE_TYPE (vectype);
8372 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8373 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8374 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8375 vec_lhs, bitsize, bitstart);
8376 gimple_seq stmts = NULL;
8377 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8378 &stmts, true, NULL_TREE);
8380 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
8382 /* Replace uses of lhs with the newly computed result. If the use stmt is
8383 a single-argument PHI, just replace all uses of the PHI result. This is
8384 necessary because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
8385 use_operand_p use_p;
8386 stmt_vec_info use_stmt_info;
8387 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8388 if (!is_gimple_debug (use_stmt)
8389 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8390 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8392 /* ??? This can happen when the live lane ends up being
8393 used in a vector construction code-generated by an
8394 external SLP node (and code-generation for that already
8395 happened). See gcc.dg/vect/bb-slp-47.c.
8396 Doing this is what would happen if that vector CTOR
8397 were not code-generated yet so it is not too bad.
8398 ??? In fact we'd likely want to avoid this situation
8399 in the first place. */
8400 if (TREE_CODE (new_tree) == SSA_NAME
8401 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8402 && gimple_code (use_stmt) != GIMPLE_PHI
8403 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8404 use_stmt))
8406 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8407 gcc_assert (code == CONSTRUCTOR
8408 || code == VIEW_CONVERT_EXPR
8409 || CONVERT_EXPR_CODE_P (code));
8410 if (dump_enabled_p ())
8411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8412 "Using original scalar computation for "
8413 "live lane because use preceeds vector "
8414 "def\n");
8415 continue;
8417 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8418 SET_USE (use_p, new_tree);
8419 update_stmt (use_stmt);
8423 return true;
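/* As a hypothetical illustration of the lane extraction above: for a
   V4SI vector VEC_LHS with BITSIZE 32 and BITSTART 96, the code builds
   BIT_FIELD_REF <VEC_LHS, 32, 96>, i.e. it reads the 32-bit lane at bit
   offset 96, and then converts that value to LHS_TYPE before replacing
   the remaining scalar uses.  */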
8426 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8428 static void
8429 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8431 ssa_op_iter op_iter;
8432 imm_use_iterator imm_iter;
8433 def_operand_p def_p;
8434 gimple *ustmt;
8436 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8438 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8440 basic_block bb;
8442 if (!is_gimple_debug (ustmt))
8443 continue;
8445 bb = gimple_bb (ustmt);
8447 if (!flow_bb_inside_loop_p (loop, bb))
8449 if (gimple_debug_bind_p (ustmt))
8451 if (dump_enabled_p ())
8452 dump_printf_loc (MSG_NOTE, vect_location,
8453 "killing debug use\n");
8455 gimple_debug_bind_reset_value (ustmt);
8456 update_stmt (ustmt);
8458 else
8459 gcc_unreachable ();
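/* For example, a debug bind outside LOOP such as
     # DEBUG x => i_5
   where i_5 is defined by a vectorized statement inside LOOP has its
   value reset above, so the debug info simply reports the variable as
   optimized away rather than referring to a stale scalar value.  */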
8465 /* Given loop represented by LOOP_VINFO, return true if computation of
8466 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8467 otherwise. */
8469 static bool
8470 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8472 /* Constant case. */
8473 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8475 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8476 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8478 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8479 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8480 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8481 return true;
8484 widest_int max;
8485 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8486 /* Check the upper bound of loop niters. */
8487 if (get_max_loop_iterations (loop, &max))
8489 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8490 signop sgn = TYPE_SIGN (type);
8491 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8492 if (max < type_max)
8493 return true;
8495 return false;
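/* Illustrative cases for the check above: if NITERSM1 is the constant 99
   then NITERS is 100 and 99 < 100, so there is no overflow.  If instead
   the loop's latch count can reach the maximum value of the IV type
   (say 0xffffffff for a 32-bit unsigned type), NITERS = NITERSM1 + 1 can
   wrap to zero, and the function conservatively returns false.  */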
8498 /* Return a mask type with half as many elements as OLD_TYPE,
8499 given that it should have mode NEW_MODE. */
8501 tree
8502 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8504 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8505 return build_truth_vector_type_for_mode (nunits, new_mode);
8508 /* Return a mask type with twice as many elements as OLD_TYPE,
8509 given that it should have mode NEW_MODE. */
8511 tree
8512 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8514 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8515 return build_truth_vector_type_for_mode (nunits, new_mode);
8518 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8519 contain a sequence of NVECTORS masks that each control a vector of type
8520 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8521 these vector masks with the vector version of SCALAR_MASK. */
8523 void
8524 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8525 unsigned int nvectors, tree vectype, tree scalar_mask)
8527 gcc_assert (nvectors != 0);
8528 if (masks->length () < nvectors)
8529 masks->safe_grow_cleared (nvectors, true);
8530 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8531 /* The number of scalars per iteration and the number of vectors are
8532 both compile-time constants. */
8533 unsigned int nscalars_per_iter
8534 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8535 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8537 if (scalar_mask)
8539 scalar_cond_masked_key cond (scalar_mask, nvectors);
8540 loop_vinfo->scalar_cond_masked_set.add (cond);
8543 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8545 rgm->max_nscalars_per_iter = nscalars_per_iter;
8546 rgm->type = truth_type_for (vectype);
8547 rgm->factor = 1;
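/* A hypothetical example of the bookkeeping above: with a vectorization
   factor of 16 and V8HI vectors, an rgroup that accesses one short per
   scalar iteration records NVECTORS = 2 and
   NSCALARS_PER_ITER = 2 * 8 / 16 = 1, while a grouped access loading two
   shorts per scalar iteration records NVECTORS = 4 and
   NSCALARS_PER_ITER = 4 * 8 / 16 = 2.  Entry MASKS[NVECTORS - 1] keeps
   the largest NSCALARS_PER_ITER seen for that vector count.  */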
8551 /* Given a complete set of masks MASKS, extract mask number INDEX
8552 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8553 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8555 See the comment above vec_loop_masks for more details about the mask
8556 arrangement. */
8558 tree
8559 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8560 unsigned int nvectors, tree vectype, unsigned int index)
8562 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8563 tree mask_type = rgm->type;
8565 /* Populate the rgroup's mask array, if this is the first time we've
8566 used it. */
8567 if (rgm->controls.is_empty ())
8569 rgm->controls.safe_grow_cleared (nvectors, true);
8570 for (unsigned int i = 0; i < nvectors; ++i)
8572 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8573 /* Provide a dummy definition until the real one is available. */
8574 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8575 rgm->controls[i] = mask;
8579 tree mask = rgm->controls[index];
8580 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8581 TYPE_VECTOR_SUBPARTS (vectype)))
8583 /* A loop mask for data type X can be reused for data type Y
8584 if X has N times more elements than Y and if Y's elements
8585 are N times bigger than X's. In this case each sequence
8586 of N elements in the loop mask will be all-zero or all-one.
8587 We can then view-convert the mask so that each sequence of
8588 N elements is replaced by a single element. */
8589 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8590 TYPE_VECTOR_SUBPARTS (vectype)));
8591 gimple_seq seq = NULL;
8592 mask_type = truth_type_for (vectype);
8593 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8594 if (seq)
8595 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8597 return mask;
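/* A concrete (hypothetical) instance of the reuse case above: a mask
   recorded for sixteen QImode elements can control a V8HImode vector,
   because each adjacent pair of mask elements is known to be all-zero or
   all-one; the VIEW_CONVERT_EXPR simply reinterprets the 16-element mask
   as the 8-element mask type returned by truth_type_for.  */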
8600 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8601 lengths for controlling an operation on VECTYPE. The operation splits
8602 each element of VECTYPE into FACTOR separate subelements, measuring the
8603 length as a number of these subelements. */
8605 void
8606 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8607 unsigned int nvectors, tree vectype, unsigned int factor)
8609 gcc_assert (nvectors != 0);
8610 if (lens->length () < nvectors)
8611 lens->safe_grow_cleared (nvectors, true);
8612 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8614 /* The number of scalars per iteration, the bytes each scalar occupies
8615 and the number of vectors are all compile-time constants. */
8616 unsigned int nscalars_per_iter
8617 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8618 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8620 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8622 /* For now, we only support cases in which all loads and stores fall back
8623 to VnQI or none do. */
8624 gcc_assert (!rgl->max_nscalars_per_iter
8625 || (rgl->factor == 1 && factor == 1)
8626 || (rgl->max_nscalars_per_iter * rgl->factor
8627 == nscalars_per_iter * factor));
8628 rgl->max_nscalars_per_iter = nscalars_per_iter;
8629 rgl->type = vectype;
8630 rgl->factor = factor;
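/* For instance (hypothetically), a V4SI operation that has to fall back
   to byte-granular VnQI lengths would be recorded with FACTOR = 4, so the
   length counts QI subelements rather than SI elements; the assert above
   enforces that, for a given NVECTORS, either every operation uses
   FACTOR = 1 or the products NSCALARS_PER_ITER * FACTOR all agree.  */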
8634 /* Given a complete set of lengths LENS, extract length number INDEX for an
8635 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
8637 tree
8638 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8639 unsigned int nvectors, unsigned int index)
8641 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8643 /* Populate the rgroup's len array, if this is the first time we've
8644 used it. */
8645 if (rgl->controls.is_empty ())
8647 rgl->controls.safe_grow_cleared (nvectors, true);
8648 for (unsigned int i = 0; i < nvectors; ++i)
8650 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8651 gcc_assert (len_type != NULL_TREE);
8652 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
8654 /* Provide a dummy definition until the real one is available. */
8655 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
8656 rgl->controls[i] = len;
8660 return rgl->controls[index];
8663 /* Scale profiling counters by estimation for LOOP which is vectorized
8664 by factor VF. */
8666 static void
8667 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8669 edge preheader = loop_preheader_edge (loop);
8670 /* Reduce loop iterations by the vectorization factor. */
8671 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8672 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8674 if (freq_h.nonzero_p ())
8676 profile_probability p;
8678 /* Avoid dropping loop body profile counter to 0 because of zero count
8679 in loop's preheader. */
8680 if (!(freq_e == profile_count::zero ()))
8681 freq_e = freq_e.force_nonzero ();
8682 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8683 scale_loop_frequencies (loop, p);
8686 edge exit_e = single_exit (loop);
8687 exit_e->probability = profile_probability::always ()
8688 .apply_scale (1, new_est_niter + 1);
8690 edge exit_l = single_pred_edge (loop->latch);
8691 profile_probability prob = exit_l->probability;
8692 exit_l->probability = exit_e->probability.invert ();
8693 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8694 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
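/* As a rough illustration: if the profile estimates about 100 loop
   iterations and VF is 4, NEW_EST_NITER is about 25, the exit edge
   probability becomes 1/(25 + 1) and the latch edge probability its
   inverse, with the body frequencies scaled to match the (roughly four
   times smaller) expected trip count.  */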
8697 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
8698 latch edge values originally defined by it. */
8700 static void
8701 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
8702 stmt_vec_info def_stmt_info)
8704 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
8705 if (!def || TREE_CODE (def) != SSA_NAME)
8706 return;
8707 stmt_vec_info phi_info;
8708 imm_use_iterator iter;
8709 use_operand_p use_p;
8710 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
8711 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
8712 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
8713 && (phi_info = loop_vinfo->lookup_stmt (phi))
8714 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
8715 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
8716 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
8718 loop_p loop = gimple_bb (phi)->loop_father;
8719 edge e = loop_latch_edge (loop);
8720 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
8722 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
8723 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
8724 gcc_assert (phi_defs.length () == latch_defs.length ());
8725 for (unsigned i = 0; i < phi_defs.length (); ++i)
8726 add_phi_arg (as_a <gphi *> (phi_defs[i]),
8727 gimple_get_lhs (latch_defs[i]), e,
8728 gimple_phi_arg_location (phi, e->dest_idx));
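/* For example, if DEF_STMT_INFO was vectorized into two copies for a
   reduction whose PHI was likewise vectorized into two PHIs, the loop
   above adds the i-th vectorized latch definition as the latch argument
   of the i-th vectorized PHI, reusing the location of the original
   scalar latch argument.  */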
8733 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8734 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8735 stmt_vec_info. */
8737 static void
8738 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8739 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8741 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8742 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8744 if (dump_enabled_p ())
8745 dump_printf_loc (MSG_NOTE, vect_location,
8746 "------>vectorizing statement: %G", stmt_info->stmt);
8748 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8749 vect_loop_kill_debug_uses (loop, stmt_info);
8751 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8752 && !STMT_VINFO_LIVE_P (stmt_info))
8753 return;
8755 if (STMT_VINFO_VECTYPE (stmt_info))
8757 poly_uint64 nunits
8758 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8759 if (!STMT_SLP_TYPE (stmt_info)
8760 && maybe_ne (nunits, vf)
8761 && dump_enabled_p ())
8762 /* For SLP, VF is set according to the unrolling factor, and not
8763 to the vector size, hence for SLP this print is not valid. */
8764 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8767 /* Pure SLP statements have already been vectorized. We still need
8768 to apply loop vectorization to hybrid SLP statements. */
8769 if (PURE_SLP_STMT (stmt_info))
8770 return;
8772 if (dump_enabled_p ())
8773 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8775 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8776 *seen_store = stmt_info;
8779 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
8780 in the hash_map with its corresponding values. */
8782 static tree
8783 find_in_mapping (tree t, void *context)
8785 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8787 tree *value = mapping->get (t);
8788 return value ? *value : t;
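/* A sketch of how this callback is used below: simplify_replace_tree
   walks an operand such as a DR_REF and calls find_in_mapping on each
   subtree, so an SSA name recorded in MAPPING is replaced by its
   epilogue copy while unmapped trees are returned unchanged.  */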
8791 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8792 original loop that has now been vectorized.
8794 The inits of the data_references need to be advanced with the number of
8795 iterations of the main loop. This has been computed in vect_do_peeling and
8796 is stored in parameter ADVANCE. We first restore the data_references
8797 initial offset with the values recorded in ORIG_DRS_INIT.
8799 Since the loop_vec_info of this EPILOGUE was constructed for the original
8800 loop, its stmt_vec_infos all point to the original statements. These need
8801 to be updated to point to their corresponding copies as well as the SSA_NAMES
8802 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8804 The data_reference's connections also need to be updated. Their
8805 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
8806 stmt_vec_infos, their statements need to point to their corresponding copy,
8807 if they are gather loads or scatter stores then their reference needs to be
8808 updated to point to its corresponding copy and finally we set
8809 'base_misaligned' to false as we have already peeled for alignment in the
8810 prologue of the main loop. */
8812 static void
8813 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8815 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8816 auto_vec<gimple *> stmt_worklist;
8817 hash_map<tree,tree> mapping;
8818 gimple *orig_stmt, *new_stmt;
8819 gimple_stmt_iterator epilogue_gsi;
8820 gphi_iterator epilogue_phi_gsi;
8821 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8822 basic_block *epilogue_bbs = get_loop_body (epilogue);
8823 unsigned i;
8825 free (LOOP_VINFO_BBS (epilogue_vinfo));
8826 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8828 /* Advance data_reference's with the number of iterations of the previous
8829 loop and its prologue. */
8830 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8833 /* The EPILOGUE loop is a copy of the original loop so they share the same
8834 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8835 point to the copied statements. We also create a mapping of all LHS' in
8836 the original loop and all the LHS' in the EPILOGUE and create worklists to
8837 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8838 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8840 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8841 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8843 new_stmt = epilogue_phi_gsi.phi ();
8845 gcc_assert (gimple_uid (new_stmt) > 0);
8846 stmt_vinfo
8847 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8849 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8850 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8852 mapping.put (gimple_phi_result (orig_stmt),
8853 gimple_phi_result (new_stmt));
8854 /* PHI nodes cannot have patterns or related statements. */
8855 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8856 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8859 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8860 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8862 new_stmt = gsi_stmt (epilogue_gsi);
8863 if (is_gimple_debug (new_stmt))
8864 continue;
8866 gcc_assert (gimple_uid (new_stmt) > 0);
8867 stmt_vinfo
8868 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8870 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8871 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8873 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8874 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8876 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8878 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8879 for (gimple_stmt_iterator gsi = gsi_start (seq);
8880 !gsi_end_p (gsi); gsi_next (&gsi))
8881 stmt_worklist.safe_push (gsi_stmt (gsi));
8884 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8885 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8887 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8888 stmt_worklist.safe_push (stmt);
8889 /* Set BB such that the assert in
8890 'get_initial_def_for_reduction' is able to determine that
8891 the BB of the related stmt is inside this loop. */
8892 gimple_set_bb (stmt,
8893 gimple_bb (new_stmt));
8894 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8895 gcc_assert (related_vinfo == NULL
8896 || related_vinfo == stmt_vinfo);
8901 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8902 using the original main loop and thus need to be updated to refer to the
8903 cloned variables used in the epilogue. */
8904 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8906 gimple *stmt = stmt_worklist[i];
8907 tree *new_op;
8909 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8911 tree op = gimple_op (stmt, j);
8912 if ((new_op = mapping.get(op)))
8913 gimple_set_op (stmt, j, *new_op);
8914 else
8916 /* PR92429: The last argument of simplify_replace_tree disables
8917 folding when replacing arguments. This is required as
8918 otherwise you might end up with different statements than the
8919 ones analyzed in vect_loop_analyze, leading to different
8920 vectorization. */
8921 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8922 &find_in_mapping, &mapping, false);
8923 gimple_set_op (stmt, j, op);
8928 struct data_reference *dr;
8929 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
8930 FOR_EACH_VEC_ELT (datarefs, i, dr)
8932 orig_stmt = DR_STMT (dr);
8933 gcc_assert (gimple_uid (orig_stmt) > 0);
8934 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8935 /* Data references for gather loads and scatter stores do not use the
8936 updated offset we set using ADVANCE. Instead we have to make sure the
8937 reference in each data reference points to the corresponding copy of
8938 the original in the epilogue. */
8939 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8940 == VMAT_GATHER_SCATTER)
8942 DR_REF (dr)
8943 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8944 &find_in_mapping, &mapping);
8945 DR_BASE_ADDRESS (dr)
8946 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8947 &find_in_mapping, &mapping);
8949 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8950 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8951 /* The vector size of the epilogue is smaller than that of the main loop,
8952 so the alignment is either the same or lower. This means the dr is
8953 by definition aligned. */
8954 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8957 epilogue_vinfo->shared->datarefs_copy.release ();
8958 epilogue_vinfo->shared->save_datarefs ();
8961 /* Function vect_transform_loop.
8963 The analysis phase has determined that the loop is vectorizable.
8964 Vectorize the loop - create vectorized stmts to replace the scalar
8965 stmts in the loop, and update the loop exit condition.
8966 Returns scalar epilogue loop if any. */
8968 class loop *
8969 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8971 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8972 class loop *epilogue = NULL;
8973 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8974 int nbbs = loop->num_nodes;
8975 int i;
8976 tree niters_vector = NULL_TREE;
8977 tree step_vector = NULL_TREE;
8978 tree niters_vector_mult_vf = NULL_TREE;
8979 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8980 unsigned int lowest_vf = constant_lower_bound (vf);
8981 gimple *stmt;
8982 bool check_profitability = false;
8983 unsigned int th;
8985 DUMP_VECT_SCOPE ("vec_transform_loop");
8987 loop_vinfo->shared->check_datarefs ();
8989 /* Use the more conservative vectorization threshold. If the number
8990 of iterations is constant, assume the cost check has been performed
8991 by our caller. If the threshold makes all loops profitable that
8992 run at least the (estimated) vectorization factor number of times,
8993 checking is pointless, too. */
8994 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8995 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8997 if (dump_enabled_p ())
8998 dump_printf_loc (MSG_NOTE, vect_location,
8999 "Profitability threshold is %d loop iterations.\n",
9000 th);
9001 check_profitability = true;
9004 /* Make sure there exists a single-predecessor exit bb. Do this before
9005 versioning. */
9006 edge e = single_exit (loop);
9007 if (! single_pred_p (e->dest))
9009 split_loop_exit_edge (e, true);
9010 if (dump_enabled_p ())
9011 dump_printf (MSG_NOTE, "split exit edge\n");
9014 /* Version the loop first, if required, so the profitability check
9015 comes first. */
9017 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9019 class loop *sloop
9020 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9021 sloop->force_vectorize = false;
9022 check_profitability = false;
9025 /* Make sure there exists a single-predecessor exit bb also on the
9026 scalar loop copy. Do this after versioning but before peeling
9027 so the CFG structure is fine for both the scalar and the if-converted
9028 loop, and slpeel_duplicate_current_defs_from_edges sees matching
9029 loop-closed PHI nodes on the exit. */
9030 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9032 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9033 if (! single_pred_p (e->dest))
9035 split_loop_exit_edge (e, true);
9036 if (dump_enabled_p ())
9037 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9041 tree niters = vect_build_loop_niters (loop_vinfo);
9042 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9043 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9044 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9045 tree advance;
9046 drs_init_vec orig_drs_init;
9048 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9049 &step_vector, &niters_vector_mult_vf, th,
9050 check_profitability, niters_no_overflow,
9051 &advance);
9053 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9054 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9055 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9056 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9058 if (niters_vector == NULL_TREE)
9060 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9061 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9062 && known_eq (lowest_vf, vf))
9064 niters_vector
9065 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9066 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9067 step_vector = build_one_cst (TREE_TYPE (niters));
9069 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9070 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9071 &step_vector, niters_no_overflow);
9072 else
9073 /* vect_do_peeling subtracted the number of peeled prologue
9074 iterations from LOOP_VINFO_NITERS. */
9075 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9076 &niters_vector, &step_vector,
9077 niters_no_overflow);
9080 /* 1) Make sure the loop header has exactly two entries
9081 2) Make sure we have a preheader basic block. */
9083 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9085 split_edge (loop_preheader_edge (loop));
9087 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9088 /* This will deal with any possible peeling. */
9089 vect_prepare_for_masked_peels (loop_vinfo);
9091 /* Schedule the SLP instances first, then handle loop vectorization
9092 below. */
9093 if (!loop_vinfo->slp_instances.is_empty ())
9095 DUMP_VECT_SCOPE ("scheduling SLP instances");
9096 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9099 /* FORNOW: the vectorizer supports only loops whose body consists
9100 of one basic block (header + empty latch). When the vectorizer
9101 supports more involved loop forms, the order in which the BBs are
9102 traversed will need to be reconsidered. */
9104 for (i = 0; i < nbbs; i++)
9106 basic_block bb = bbs[i];
9107 stmt_vec_info stmt_info;
9109 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9110 gsi_next (&si))
9112 gphi *phi = si.phi ();
9113 if (dump_enabled_p ())
9114 dump_printf_loc (MSG_NOTE, vect_location,
9115 "------>vectorizing phi: %G", phi);
9116 stmt_info = loop_vinfo->lookup_stmt (phi);
9117 if (!stmt_info)
9118 continue;
9120 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9121 vect_loop_kill_debug_uses (loop, stmt_info);
9123 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9124 && !STMT_VINFO_LIVE_P (stmt_info))
9125 continue;
9127 if (STMT_VINFO_VECTYPE (stmt_info)
9128 && (maybe_ne
9129 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9130 && dump_enabled_p ())
9131 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9133 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9134 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9135 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9136 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9137 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9138 && ! PURE_SLP_STMT (stmt_info))
9140 if (dump_enabled_p ())
9141 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9142 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9146 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9147 gsi_next (&si))
9149 gphi *phi = si.phi ();
9150 stmt_info = loop_vinfo->lookup_stmt (phi);
9151 if (!stmt_info)
9152 continue;
9154 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9155 && !STMT_VINFO_LIVE_P (stmt_info))
9156 continue;
9158 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9159 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9160 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9161 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9162 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9163 && ! PURE_SLP_STMT (stmt_info))
9164 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9167 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9168 !gsi_end_p (si);)
9170 stmt = gsi_stmt (si);
9171 /* During vectorization remove existing clobber stmts. */
9172 if (gimple_clobber_p (stmt))
9174 unlink_stmt_vdef (stmt);
9175 gsi_remove (&si, true);
9176 release_defs (stmt);
9178 else
9180 /* Ignore vector stmts created in the outer loop. */
9181 stmt_info = loop_vinfo->lookup_stmt (stmt);
9183 /* vector stmts created in the outer-loop during vectorization of
9184 stmts in an inner-loop may not have a stmt_info, and do not
9185 need to be vectorized. */
9186 stmt_vec_info seen_store = NULL;
9187 if (stmt_info)
9189 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9191 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9192 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9193 !gsi_end_p (subsi); gsi_next (&subsi))
9195 stmt_vec_info pat_stmt_info
9196 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9197 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9198 &si, &seen_store);
9200 stmt_vec_info pat_stmt_info
9201 = STMT_VINFO_RELATED_STMT (stmt_info);
9202 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
9203 &seen_store);
9204 maybe_set_vectorized_backedge_value (loop_vinfo,
9205 pat_stmt_info);
9207 else
9209 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9210 &seen_store);
9211 maybe_set_vectorized_backedge_value (loop_vinfo,
9212 stmt_info);
9215 gsi_next (&si);
9216 if (seen_store)
9218 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9219 /* Interleaving. The vectorization of the
9220 interleaving chain was completed - free
9221 all the stores in the chain. */
9222 vect_remove_stores (loop_vinfo,
9223 DR_GROUP_FIRST_ELEMENT (seen_store));
9224 else
9225 /* Free the attached stmt_vec_info and remove the stmt. */
9226 loop_vinfo->remove_stmt (stmt_info);
9231 /* Stub out scalar statements that must not survive vectorization.
9232 Doing this here helps with grouped statements, or statements that
9233 are involved in patterns. */
9234 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9235 !gsi_end_p (gsi); gsi_next (&gsi))
9237 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9238 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9240 tree lhs = gimple_get_lhs (call);
9241 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9243 tree zero = build_zero_cst (TREE_TYPE (lhs));
9244 gimple *new_stmt = gimple_build_assign (lhs, zero);
9245 gsi_replace (&gsi, new_stmt, true);
9249 } /* BBs in loop */
9251 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9252 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9253 if (integer_onep (step_vector))
9254 niters_no_overflow = true;
9255 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9256 niters_vector_mult_vf, !niters_no_overflow);
9258 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9259 scale_profile_for_vect_loop (loop, assumed_vf);
9261 /* True if the final iteration might not handle a full vector's
9262 worth of scalar iterations. */
9263 bool final_iter_may_be_partial
9264 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9265 /* The minimum number of iterations performed by the epilogue. This
9266 is 1 when peeling for gaps because we always need a final scalar
9267 iteration. */
9268 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9269 /* +1 to convert latch counts to loop iteration counts,
9270 -min_epilogue_iters to remove iterations that cannot be performed
9271 by the vector code. */
9272 int bias_for_lowest = 1 - min_epilogue_iters;
9273 int bias_for_assumed = bias_for_lowest;
9274 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9275 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9277 /* When the amount of peeling is known at compile time, the first
9278 iteration will have exactly alignment_npeels active elements.
9279 In the worst case it will have at least one. */
9280 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9281 bias_for_lowest += lowest_vf - min_first_active;
9282 bias_for_assumed += assumed_vf - min_first_active;
9284 /* In these calculations the "- 1" converts loop iteration counts
9285 back to latch counts. */
9286 if (loop->any_upper_bound)
9287 loop->nb_iterations_upper_bound
9288 = (final_iter_may_be_partial
9289 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9290 lowest_vf) - 1
9291 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9292 lowest_vf) - 1);
9293 if (loop->any_likely_upper_bound)
9294 loop->nb_iterations_likely_upper_bound
9295 = (final_iter_may_be_partial
9296 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9297 + bias_for_lowest, lowest_vf) - 1
9298 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9299 + bias_for_lowest, lowest_vf) - 1);
9300 if (loop->any_estimate)
9301 loop->nb_iterations_estimate
9302 = (final_iter_may_be_partial
9303 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9304 assumed_vf) - 1
9305 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9306 assumed_vf) - 1);
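  /* A worked (hypothetical) example of the adjustment above: with
     LOWEST_VF = 4, no peeling for gaps (BIAS_FOR_LOWEST = 1) and a
     recorded upper bound of 102 latch iterations (103 iterations), the
     non-partial-vector case gives udiv_floor (102 + 1, 4) - 1 = 24,
     i.e. at most 25 iterations of the transformed vector loop.  */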
9308 if (dump_enabled_p ())
9310 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9312 dump_printf_loc (MSG_NOTE, vect_location,
9313 "LOOP VECTORIZED\n");
9314 if (loop->inner)
9315 dump_printf_loc (MSG_NOTE, vect_location,
9316 "OUTER LOOP VECTORIZED\n");
9317 dump_printf (MSG_NOTE, "\n");
9319 else
9320 dump_printf_loc (MSG_NOTE, vect_location,
9321 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9322 GET_MODE_NAME (loop_vinfo->vector_mode));
9325 /* Loops vectorized with a variable factor won't benefit from
9326 unrolling/peeling. */
9327 if (!vf.is_constant ())
9329 loop->unroll = 1;
9330 if (dump_enabled_p ())
9331 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9332 " variable-length vectorization factor\n");
9334 /* Free SLP instances here because otherwise stmt reference counting
9335 won't work. */
9336 slp_instance instance;
9337 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9338 vect_free_slp_instance (instance);
9339 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9340 /* Clear the safelen field since its value is invalid after vectorization,
9341 as the vectorized loop can have loop-carried dependencies. */
9342 loop->safelen = 0;
9344 if (epilogue)
9346 update_epilogue_loop_vinfo (epilogue, advance);
9348 epilogue->simduid = loop->simduid;
9349 epilogue->force_vectorize = loop->force_vectorize;
9350 epilogue->dont_vectorize = false;
9353 return epilogue;
9356 /* The code below tries to perform a simple optimization - revert
9357 if-conversion for masked stores, i.e. if the mask of a store is zero,
9358 do not perform it nor, if possible, the producers of the stored values.
9359 For example,
9360 for (i=0; i<n; i++)
9361 if (c[i])
9363 p1[i] += 1;
9364 p2[i] = p3[i] +2;
9366 this transformation will produce the following semi-hammock:
9368 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9370 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9371 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9372 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9373 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9374 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9375 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9379 void
9380 optimize_mask_stores (class loop *loop)
9382 basic_block *bbs = get_loop_body (loop);
9383 unsigned nbbs = loop->num_nodes;
9384 unsigned i;
9385 basic_block bb;
9386 class loop *bb_loop;
9387 gimple_stmt_iterator gsi;
9388 gimple *stmt;
9389 auto_vec<gimple *> worklist;
9390 auto_purge_vect_location sentinel;
9392 vect_location = find_loop_location (loop);
9393 /* Pick up all masked stores in loop if any. */
9394 for (i = 0; i < nbbs; i++)
9396 bb = bbs[i];
9397 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9398 gsi_next (&gsi))
9400 stmt = gsi_stmt (gsi);
9401 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9402 worklist.safe_push (stmt);
9406 free (bbs);
9407 if (worklist.is_empty ())
9408 return;
9410 /* Loop has masked stores. */
9411 while (!worklist.is_empty ())
9413 gimple *last, *last_store;
9414 edge e, efalse;
9415 tree mask;
9416 basic_block store_bb, join_bb;
9417 gimple_stmt_iterator gsi_to;
9418 tree vdef, new_vdef;
9419 gphi *phi;
9420 tree vectype;
9421 tree zero;
9423 last = worklist.pop ();
9424 mask = gimple_call_arg (last, 2);
9425 bb = gimple_bb (last);
9426 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
9427 to the same loop as if_bb. That loop could differ from LOOP when a
9428 two-level loop nest is vectorized and the mask_store belongs to the
9429 inner one. */
9430 e = split_block (bb, last);
9431 bb_loop = bb->loop_father;
9432 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9433 join_bb = e->dest;
9434 store_bb = create_empty_bb (bb);
9435 add_bb_to_loop (store_bb, bb_loop);
9436 e->flags = EDGE_TRUE_VALUE;
9437 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9438 /* Put STORE_BB on the unlikely path. */
9439 efalse->probability = profile_probability::unlikely ();
9440 store_bb->count = efalse->count ();
9441 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9442 if (dom_info_available_p (CDI_DOMINATORS))
9443 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9444 if (dump_enabled_p ())
9445 dump_printf_loc (MSG_NOTE, vect_location,
9446 "Create new block %d to sink mask stores.",
9447 store_bb->index);
9448 /* Create vector comparison with boolean result. */
9449 vectype = TREE_TYPE (mask);
9450 zero = build_zero_cst (vectype);
9451 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9452 gsi = gsi_last_bb (bb);
9453 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9454 /* Create new PHI node for vdef of the last masked store:
9455 .MEM_2 = VDEF <.MEM_1>
9456 will be converted to
9457 .MEM.3 = VDEF <.MEM_1>
9458 and new PHI node will be created in join bb
9459 .MEM_2 = PHI <.MEM_1, .MEM_3>
9461 vdef = gimple_vdef (last);
9462 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9463 gimple_set_vdef (last, new_vdef);
9464 phi = create_phi_node (vdef, join_bb);
9465 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9467 /* Put all masked stores with the same mask to STORE_BB if possible. */
9468 while (true)
9470 gimple_stmt_iterator gsi_from;
9471 gimple *stmt1 = NULL;
9473 /* Move masked store to STORE_BB. */
9474 last_store = last;
9475 gsi = gsi_for_stmt (last);
9476 gsi_from = gsi;
9477 /* Shift GSI to the previous stmt for further traversal. */
9478 gsi_prev (&gsi);
9479 gsi_to = gsi_start_bb (store_bb);
9480 gsi_move_before (&gsi_from, &gsi_to);
9481 /* Set up GSI_TO at the start of the now non-empty block. */
9482 gsi_to = gsi_start_bb (store_bb);
9483 if (dump_enabled_p ())
9484 dump_printf_loc (MSG_NOTE, vect_location,
9485 "Move stmt to created bb\n%G", last);
9486 /* Move all stored value producers if possible. */
9487 while (!gsi_end_p (gsi))
9489 tree lhs;
9490 imm_use_iterator imm_iter;
9491 use_operand_p use_p;
9492 bool res;
9494 /* Skip debug statements. */
9495 if (is_gimple_debug (gsi_stmt (gsi)))
9497 gsi_prev (&gsi);
9498 continue;
9500 stmt1 = gsi_stmt (gsi);
9501 /* Do not consider statements writing to memory or having
9502 a volatile operand. */
9503 if (gimple_vdef (stmt1)
9504 || gimple_has_volatile_ops (stmt1))
9505 break;
9506 gsi_from = gsi;
9507 gsi_prev (&gsi);
9508 lhs = gimple_get_lhs (stmt1);
9509 if (!lhs)
9510 break;
9512 /* LHS of vectorized stmt must be SSA_NAME. */
9513 if (TREE_CODE (lhs) != SSA_NAME)
9514 break;
9516 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9518 /* Remove dead scalar statement. */
9519 if (has_zero_uses (lhs))
9521 gsi_remove (&gsi_from, true);
9522 continue;
9526 /* Check that LHS does not have uses outside of STORE_BB. */
9527 res = true;
9528 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9530 gimple *use_stmt;
9531 use_stmt = USE_STMT (use_p);
9532 if (is_gimple_debug (use_stmt))
9533 continue;
9534 if (gimple_bb (use_stmt) != store_bb)
9536 res = false;
9537 break;
9540 if (!res)
9541 break;
9543 if (gimple_vuse (stmt1)
9544 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9545 break;
9547 /* Can move STMT1 to STORE_BB. */
9548 if (dump_enabled_p ())
9549 dump_printf_loc (MSG_NOTE, vect_location,
9550 "Move stmt to created bb\n%G", stmt1);
9551 gsi_move_before (&gsi_from, &gsi_to);
9552 /* Shift GSI_TO for further insertion. */
9553 gsi_prev (&gsi_to);
9555 /* Put other masked stores with the same mask to STORE_BB. */
9556 if (worklist.is_empty ()
9557 || gimple_call_arg (worklist.last (), 2) != mask
9558 || worklist.last () != stmt1)
9559 break;
9560 last = worklist.pop ();
9562 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9566 /* Decide whether it is possible to use a zero-based induction variable
9567 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9568 the value that the induction variable must be able to hold in order
9569 to ensure that the rgroups eventually have no active vector elements.
9570 Return -1 otherwise. */
9572 widest_int
9573 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9575 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9576 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9577 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9579 /* Calculate the value that the induction variable must be able
9580 to hit in order to ensure that we end the loop with an all-false mask.
9581 This involves adding the maximum number of inactive trailing scalar
9582 iterations. */
9583 widest_int iv_limit = -1;
9584 if (max_loop_iterations (loop, &iv_limit))
9586 if (niters_skip)
9588 /* Add the maximum number of skipped iterations to the
9589 maximum iteration count. */
9590 if (TREE_CODE (niters_skip) == INTEGER_CST)
9591 iv_limit += wi::to_widest (niters_skip);
9592 else
9593 iv_limit += max_vf - 1;
9595 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9596 /* Make a conservatively-correct assumption. */
9597 iv_limit += max_vf - 1;
9599 /* IV_LIMIT is the maximum number of latch iterations, which is also
9600 the maximum in-range IV value. Round this value down to the previous
9601 vector alignment boundary and then add an extra full iteration. */
9602 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9603 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9605 return iv_limit;
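/* A hypothetical example of the final rounding step: with a maximum
   latch count of 1001, VF = 4 (and MAX_VF = 4) and no skipped or peeled
   iterations, IV_LIMIT becomes (1001 & -4) + 4 = 1004, i.e. the limit is
   rounded down to the previous multiple of the vector alignment and then
   given one extra full vector iteration of headroom.  */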
9608 /* For the given rgroup_controls RGC, check whether an induction variable
9609 would ever hit a value that produces a set of all-false masks or zero
9610 lengths before wrapping around. Return true if it's possible to wrap
9611 around before hitting the desirable value, otherwise return false. */
9613 bool
9614 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9616 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9618 if (iv_limit == -1)
9619 return true;
9621 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9622 unsigned int compare_precision = TYPE_PRECISION (compare_type);
9623 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
9625 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9626 return true;
9628 return false;
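/* For instance (hypothetically), with a 32-bit RGROUP_COMPARE_TYPE, an
   IV_LIMIT of 1 << 30 and an rgroup where MAX_NSCALARS_PER_ITER * FACTOR
   is 8, the product needs 34 bits of unsigned precision, which exceeds
   the 32-bit compare precision, so the function returns true (the IV
   might wrap before producing an all-false mask or zero length).  */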