gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
69   as if it had been manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133   Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140   Targets that can support different sizes of vectors will, for now, need
141   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144   Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147   machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152   http://gcc.gnu.org/projects/tree-ssa/vectorization.html  */
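/* As a rough sketch (the V8HI addition here is only an illustration), the
   target-support check described above amounts to:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       ... no target support, the addition cannot be vectorized ...
     else
       ... the target has a V8HImode add, so the stmt may be vectorized ...  */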
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
186 if (stmt_vectype)
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return opt_result::success ();
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
250 return opt_result::success ();
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257   loop. For example, when vectorizing a loop that operates on 4-byte elements,
258   on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274         a[i:VF] = b[i:VF] + c[i:VF];
        }  */
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 for (i = 0; i < nbbs; i++)
295 basic_block bb = bbs[i];
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
306 gcc_assert (stmt_info);
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
338 vect_update_max_nunits (&vectorization_factor, vectype);
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
372 /* Function vect_is_simple_iv_evolution.
374   FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
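/* For example, an induction variable defined by (SSA names are illustrative)

     i_1 = PHI <0 (preheader), i_2 (latch)>
     i_2 = i_1 + 1;

   has the access function {0, +, 1}_loop: INIT is 0 and STEP is 1, which is
   accepted here.  An IV whose step itself evolves in the loop has an
   evolution part that is again a chrec and is rejected as not "simple".  */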
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
403 *init = init_expr;
404 *step = step_expr;
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
422 return true;
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
436 x_3 = ...;
439 outer2:
440 x_4 = PHI <x_3(inner)>;
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
458 /* Function vect_analyze_scalar_cycles_1.
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified, therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
516 worklist.safe_push (stmt_vinfo);
517 continue;
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
559 else
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
569 else
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
594 /* Function vect_analyze_scalar_cycles.
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599   We do that for the loop represented by LOOP_VINFO, and also for its
600   inner-loop, if it exists.
601 Examples for scalar cycles:
603 Example1: reduction:
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
609 Example2: induction:
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623   Reductions in such an inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
657 while (stmt_info);
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
665 stmt_vec_info first;
666 unsigned i;
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
670 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
671 while (next)
673 if ((STMT_VINFO_IN_PATTERN_P (next)
674 != STMT_VINFO_IN_PATTERN_P (first))
675 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
679       /* If all reduction chain members are well-formed patterns, adjust
680 the group to group the pattern stmts instead. */
681 if (! next
682 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
684 if (STMT_VINFO_IN_PATTERN_P (first))
686 vect_fixup_reduc_chain (first);
687 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
688 = STMT_VINFO_RELATED_STMT (first);
691       /* If not all stmts in the chain are patterns, or if we failed
692          to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
693          it as a regular reduction instead.  */
694 else
696 stmt_vec_info vinfo = first;
697 stmt_vec_info last = NULL;
698 while (vinfo)
700 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
701 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
702 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
703 last = vinfo;
704 vinfo = next;
706 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
707 = vect_internal_def;
708 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
709 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
710 --i;
715 /* Function vect_get_loop_niters.
717   Determine how many iterations the loop executes and place it
718 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
719 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
720 niter information holds in ASSUMPTIONS.
722 Return the loop exit condition. */
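/* For example, for a loop such as

     for (i = 0; i < n; i++)
       a[i] = 0;

   with n known to be positive, the latch runs n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS (the number of
   header executions) is n.  */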
725 static gcond *
726 vect_get_loop_niters (class loop *loop, tree *assumptions,
727 tree *number_of_iterations, tree *number_of_iterationsm1)
729 edge exit = single_exit (loop);
730 class tree_niter_desc niter_desc;
731 tree niter_assumptions, niter, may_be_zero;
732 gcond *cond = get_loop_exit_condition (loop);
734 *assumptions = boolean_true_node;
735 *number_of_iterationsm1 = chrec_dont_know;
736 *number_of_iterations = chrec_dont_know;
737 DUMP_VECT_SCOPE ("get_loop_niters");
739 if (!exit)
740 return cond;
742 may_be_zero = NULL_TREE;
743 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
744 || chrec_contains_undetermined (niter_desc.niter))
745 return cond;
747 niter_assumptions = niter_desc.assumptions;
748 may_be_zero = niter_desc.may_be_zero;
749 niter = niter_desc.niter;
751 if (may_be_zero && integer_zerop (may_be_zero))
752 may_be_zero = NULL_TREE;
754 if (may_be_zero)
756 if (COMPARISON_CLASS_P (may_be_zero))
758       /* Try to combine may_be_zero with assumptions; this can simplify
759 computation of niter expression. */
760 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
761 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
762 niter_assumptions,
763 fold_build1 (TRUTH_NOT_EXPR,
764 boolean_type_node,
765 may_be_zero));
766 else
767 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
768 build_int_cst (TREE_TYPE (niter), 0),
769 rewrite_to_non_trapping_overflow (niter));
771 may_be_zero = NULL_TREE;
773 else if (integer_nonzerop (may_be_zero))
775 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
776 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
777 return cond;
779 else
780 return cond;
783 *assumptions = niter_assumptions;
784 *number_of_iterationsm1 = niter;
786 /* We want the number of loop header executions which is the number
787 of latch executions plus one.
788 ??? For UINT_MAX latch executions this number overflows to zero
789 for loops like do { n++; } while (n != 0); */
790 if (niter && !chrec_contains_undetermined (niter))
791 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
792 build_int_cst (TREE_TYPE (niter), 1));
793 *number_of_iterations = niter;
795 return cond;
798 /* Function bb_in_loop_p
800 Used as predicate for dfs order traversal of the loop bbs. */
802 static bool
803 bb_in_loop_p (const_basic_block bb, const void *data)
805 const class loop *const loop = (const class loop *)data;
806 if (flow_bb_inside_loop_p (loop, bb))
807 return true;
808 return false;
812 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
813 stmt_vec_info structs for all the stmts in LOOP_IN. */
815 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
816 : vec_info (vec_info::loop, init_cost (loop_in), shared),
817 loop (loop_in),
818 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
819 num_itersm1 (NULL_TREE),
820 num_iters (NULL_TREE),
821 num_iters_unchanged (NULL_TREE),
822 num_iters_assumptions (NULL_TREE),
823 th (0),
824 versioning_threshold (0),
825 vectorization_factor (0),
826 max_vectorization_factor (0),
827 mask_skip_niters (NULL_TREE),
828 rgroup_compare_type (NULL_TREE),
829 simd_if_cond (NULL_TREE),
830 unaligned_dr (NULL),
831 peeling_for_alignment (0),
832 ptr_mask (0),
833 ivexpr_map (NULL),
834 scan_map (NULL),
835 slp_unrolling_factor (1),
836 single_scalar_iteration_cost (0),
837 vec_outside_cost (0),
838 vec_inside_cost (0),
839 vectorizable (false),
840 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
841 using_partial_vectors_p (false),
842 epil_using_partial_vectors_p (false),
843 peeling_for_gaps (false),
844 peeling_for_niter (false),
845 no_data_dependencies (false),
846 has_mask_store (false),
847 scalar_loop_scaling (profile_probability::uninitialized ()),
848 scalar_loop (NULL),
849 orig_loop_info (NULL)
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853      case of the loop forms we allow, a dfs order of the BBs would be the same
854 as reversed postorder traversal, so we are safe. */
856 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
857 bbs, loop->num_nodes, loop);
858 gcc_assert (nbbs == loop->num_nodes);
860 for (unsigned int i = 0; i < nbbs; i++)
862 basic_block bb = bbs[i];
863 gimple_stmt_iterator si;
865 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
867 gimple *phi = gsi_stmt (si);
868 gimple_set_uid (phi, 0);
869 add_stmt (phi);
872 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
874 gimple *stmt = gsi_stmt (si);
875 gimple_set_uid (stmt, 0);
876 if (is_gimple_debug (stmt))
877 continue;
878 add_stmt (stmt);
879       /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
880          third argument is the #pragma omp simd if (x) condition: when it is 0,
881          the loop shouldn't be vectorized; when it is a non-zero constant, it
882          should be vectorized normally; otherwise the loop is versioned, with
883          the vectorized copy used only if the condition is non-zero at runtime.  */
884 if (loop_in->simduid
885 && is_gimple_call (stmt)
886 && gimple_call_internal_p (stmt)
887 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
888 && gimple_call_num_args (stmt) >= 3
889 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
890 && (loop_in->simduid
891 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
893 tree arg = gimple_call_arg (stmt, 2);
894 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
895 simd_if_cond = arg;
896 else
897 gcc_assert (integer_nonzerop (arg));
902 epilogue_vinfos.create (6);
905 /* Free all levels of rgroup CONTROLS. */
907 void
908 release_vec_loop_controls (vec<rgroup_controls> *controls)
910 rgroup_controls *rgc;
911 unsigned int i;
912 FOR_EACH_VEC_ELT (*controls, i, rgc)
913 rgc->controls.release ();
914 controls->release ();
917 /* Free all memory used by the _loop_vec_info, as well as all the
918 stmt_vec_info structs of all the stmts in the loop. */
920 _loop_vec_info::~_loop_vec_info ()
922 free (bbs);
924 release_vec_loop_controls (&masks);
925 release_vec_loop_controls (&lens);
926 delete ivexpr_map;
927 delete scan_map;
928 epilogue_vinfos.release ();
930   /* When we release an epilogue vinfo that we do not intend to use
931 avoid clearing AUX of the main loop which should continue to
932 point to the main loop vinfo since otherwise we'll leak that. */
933 if (loop->aux == this)
934 loop->aux = NULL;
937 /* Return an invariant or register for EXPR and emit necessary
938 computations in the LOOP_VINFO loop preheader. */
940 tree
941 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
943 if (is_gimple_reg (expr)
944 || is_gimple_min_invariant (expr))
945 return expr;
947 if (! loop_vinfo->ivexpr_map)
948 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
949 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
950 if (! cached)
952 gimple_seq stmts = NULL;
953 cached = force_gimple_operand (unshare_expr (expr),
954 &stmts, true, NULL_TREE);
955 if (stmts)
957 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
958 gsi_insert_seq_on_edge_immediate (e, stmts);
961 return cached;
964 /* Return true if we can use CMP_TYPE as the comparison type to produce
965 all masks required to mask LOOP_VINFO. */
967 static bool
968 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
970 rgroup_controls *rgm;
971 unsigned int i;
972 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
973 if (rgm->type != NULL_TREE
974 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
975 cmp_type, rgm->type,
976 OPTIMIZE_FOR_SPEED))
977 return false;
978 return true;
981 /* Calculate the maximum number of scalars per iteration for every
982 rgroup in LOOP_VINFO. */
984 static unsigned int
985 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
987 unsigned int res = 1;
988 unsigned int i;
989 rgroup_controls *rgm;
990 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
991 res = MAX (res, rgm->max_nscalars_per_iter);
992 return res;
995 /* Calculate the minimum precision necessary to represent:
997 MAX_NITERS * FACTOR
999 as an unsigned integer, where MAX_NITERS is the maximum number of
1000 loop header iterations for the original scalar form of LOOP_VINFO. */
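/* For example, if the niter analysis bounds the header iterations by 1000
   and FACTOR is 4 (both values are only illustrative), the product is at
   most 4000, which fits in 12 unsigned bits (2047 < 4000 <= 4095), so this
   returns 12.  */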
1002 static unsigned
1003 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1005 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1007 /* Get the maximum number of iterations that is representable
1008 in the counter type. */
1009 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1010 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1012 /* Get a more refined estimate for the number of iterations. */
1013 widest_int max_back_edges;
1014 if (max_loop_iterations (loop, &max_back_edges))
1015 max_ni = wi::smin (max_ni, max_back_edges + 1);
1017 /* Work out how many bits we need to represent the limit. */
1018 return wi::min_precision (max_ni * factor, UNSIGNED);
1021 /* True if the loop needs peeling or partial vectors when vectorized. */
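/* For example, with a known iteration count of 10, a vectorization factor
   of 4 and no peeling for alignment or gaps (illustrative values), 10 is
   not a multiple of 4, so the 2 leftover iterations need an epilogue or
   partial vectors and this returns true.  */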
1023 static bool
1024 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1026 unsigned HOST_WIDE_INT const_vf;
1027 HOST_WIDE_INT max_niter
1028 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1030 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1031 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1032 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1033 (loop_vinfo));
1035 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1036 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1038 /* Work out the (constant) number of iterations that need to be
1039 peeled for reasons other than niters. */
1040 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1041 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1042 peel_niter += 1;
1043 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1044 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1045 return true;
1047 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1048 /* ??? When peeling for gaps but not alignment, we could
1049 try to check whether the (variable) niters is known to be
1050 VF * N + 1. That's something of a niche case though. */
1051 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1052 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1053 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1054 < (unsigned) exact_log2 (const_vf))
1055 /* In case of versioning, check if the maximum number of
1056 iterations is greater than th. If they are identical,
1057 the epilogue is unnecessary. */
1058 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1059 || ((unsigned HOST_WIDE_INT) max_niter
1060 > (th / const_vf) * const_vf))))
1061 return true;
1063 return false;
1066 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1067 whether we can actually generate the masks required. Return true if so,
1068 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1070 static bool
1071 vect_verify_full_masking (loop_vec_info loop_vinfo)
1073 unsigned int min_ni_width;
1074 unsigned int max_nscalars_per_iter
1075 = vect_get_max_nscalars_per_iter (loop_vinfo);
1077 /* Use a normal loop if there are no statements that need masking.
1078 This only happens in rare degenerate cases: it means that the loop
1079 has no loads, no stores, and no live-out values. */
1080 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1081 return false;
1083 /* Work out how many bits we need to represent the limit. */
1084 min_ni_width
1085 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1087 /* Find a scalar mode for which WHILE_ULT is supported. */
1088 opt_scalar_int_mode cmp_mode_iter;
1089 tree cmp_type = NULL_TREE;
1090 tree iv_type = NULL_TREE;
1091 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1092 unsigned int iv_precision = UINT_MAX;
1094 if (iv_limit != -1)
1095 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1096 UNSIGNED);
1098 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1100 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1101 if (cmp_bits >= min_ni_width
1102 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1104 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1105 if (this_type
1106 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1108 /* Although we could stop as soon as we find a valid mode,
1109 there are at least two reasons why that's not always the
1110 best choice:
1112 - An IV that's Pmode or wider is more likely to be reusable
1113 in address calculations than an IV that's narrower than
1114 Pmode.
1116 - Doing the comparison in IV_PRECISION or wider allows
1117 a natural 0-based IV, whereas using a narrower comparison
1118 type requires mitigations against wrap-around.
1120 Conversely, if the IV limit is variable, doing the comparison
1121 in a wider type than the original type can introduce
1122 unnecessary extensions, so picking the widest valid mode
1123 is not always a good choice either.
1125 Here we prefer the first IV type that's Pmode or wider,
1126 and the first comparison type that's IV_PRECISION or wider.
1127 (The comparison type must be no wider than the IV type,
1128 to avoid extensions in the vector loop.)
1130 ??? We might want to try continuing beyond Pmode for ILP32
1131 targets if CMP_BITS < IV_PRECISION. */
1132 iv_type = this_type;
1133 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1134 cmp_type = this_type;
1135 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1136 break;
1141 if (!cmp_type)
1142 return false;
1144 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1145 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1146 return true;
1149 /* Check whether we can use vector access with length based on precision
1150    comparison. So far, to keep it simple, we only allow the case in which the
1151    precision of the target-supported length is larger than the precision
1152    required by the loop niters.  */
1154 static bool
1155 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1157 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1158 return false;
1160 unsigned int max_nitems_per_iter = 1;
1161 unsigned int i;
1162 rgroup_controls *rgl;
1163 /* Find the maximum number of items per iteration for every rgroup. */
1164 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1166 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1167 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1170 /* Work out how many bits we need to represent the length limit. */
1171 unsigned int min_ni_prec
1172 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1174   /* Now use the maximum of the precisions below for one suitable IV type:
1175 - the IV's natural precision
1176 - the precision needed to hold: the maximum number of scalar
1177 iterations multiplied by the scale factor (min_ni_prec above)
1178 - the Pmode precision
1180 If min_ni_prec is less than the precision of the current niters,
1181      we prefer to still use the niters type.  Prefer to use Pmode and
1182      a wider IV to avoid narrow conversions.  */
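  /* For example, on a 64-bit target (64-bit Pmode) with a 32-bit niters type
     and a min_ni_prec of, say, 12 bits, the maximum of the three is 64, so
     the first supported integer mode of at least 64 bits provides the IV
     type below.  */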
1184 unsigned int ni_prec
1185 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1186 min_ni_prec = MAX (min_ni_prec, ni_prec);
1187 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1189 tree iv_type = NULL_TREE;
1190 opt_scalar_int_mode tmode_iter;
1191 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1193 scalar_mode tmode = tmode_iter.require ();
1194 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1196 /* ??? Do we really want to construct one IV whose precision exceeds
1197 BITS_PER_WORD? */
1198 if (tbits > BITS_PER_WORD)
1199 break;
1201 /* Find the first available standard integral type. */
1202 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1204 iv_type = build_nonstandard_integer_type (tbits, true);
1205 break;
1209 if (!iv_type)
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 "can't vectorize with length-based partial vectors"
1214 " because there is no suitable iv type.\n");
1215 return false;
1218 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1219 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1221 return true;
1224 /* Calculate the cost of one scalar iteration of the loop. */
1225 static void
1226 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1228 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1229 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1230 int nbbs = loop->num_nodes, factor;
1231 int innerloop_iters, i;
1233 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1235 /* Gather costs for statements in the scalar loop. */
1237 /* FORNOW. */
1238 innerloop_iters = 1;
1239 if (loop->inner)
1240 innerloop_iters = 50; /* FIXME */
1242 for (i = 0; i < nbbs; i++)
1244 gimple_stmt_iterator si;
1245 basic_block bb = bbs[i];
1247 if (bb->loop_father == loop->inner)
1248 factor = innerloop_iters;
1249 else
1250 factor = 1;
1252 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1254 gimple *stmt = gsi_stmt (si);
1255 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1257 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1258 continue;
1260 /* Skip stmts that are not vectorized inside the loop. */
1261 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1262 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1263 && (!STMT_VINFO_LIVE_P (vstmt_info)
1264 || !VECTORIZABLE_CYCLE_DEF
1265 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1266 continue;
1268 vect_cost_for_stmt kind;
1269 if (STMT_VINFO_DATA_REF (stmt_info))
1271 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1272 kind = scalar_load;
1273 else
1274 kind = scalar_store;
1276 else if (vect_nop_conversion_p (stmt_info))
1277 continue;
1278 else
1279 kind = scalar_stmt;
1281 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1282 factor, kind, stmt_info, 0, vect_prologue);
1286 /* Now accumulate cost. */
1287 void *target_cost_data = init_cost (loop);
1288 stmt_info_for_cost *si;
1289 int j;
1290 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1291 j, si)
1292 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1293 si->kind, si->stmt_info, si->vectype,
1294 si->misalign, vect_body);
1295 unsigned dummy, body_cost = 0;
1296 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1297 destroy_cost_data (target_cost_data);
1298 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1302 /* Function vect_analyze_loop_form_1.
1304 Verify that certain CFG restrictions hold, including:
1305 - the loop has a pre-header
1306 - the loop has a single entry and exit
1307 - the loop exit condition is simple enough
1308    - the number of iterations can be analyzed, i.e., a countable loop. The
1309 niter could be analyzed under some assumptions. */
1311 opt_result
1312 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1313 tree *assumptions, tree *number_of_iterationsm1,
1314 tree *number_of_iterations, gcond **inner_loop_cond)
1316 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1318 /* Different restrictions apply when we are considering an inner-most loop,
1319 vs. an outer (nested) loop.
1320 (FORNOW. May want to relax some of these restrictions in the future). */
1322 if (!loop->inner)
1324 /* Inner-most loop. We currently require that the number of BBs is
1325 exactly 2 (the header and latch). Vectorizable inner-most loops
1326 look like this:
1328 (pre-header)
1330 header <--------+
1331 | | |
1332 | +--> latch --+
1334 (exit-bb) */
1336 if (loop->num_nodes != 2)
1337 return opt_result::failure_at (vect_location,
1338 "not vectorized:"
1339 " control flow in loop.\n");
1341 if (empty_block_p (loop->header))
1342 return opt_result::failure_at (vect_location,
1343 "not vectorized: empty loop.\n");
1345 else
1347 class loop *innerloop = loop->inner;
1348 edge entryedge;
1350 /* Nested loop. We currently require that the loop is doubly-nested,
1351 contains a single inner loop, and the number of BBs is exactly 5.
1352 Vectorizable outer-loops look like this:
1354 (pre-header)
1356 header <---+
1358 inner-loop |
1360 tail ------+
1362 (exit-bb)
1364 The inner-loop has the properties expected of inner-most loops
1365 as described above. */
1367 if ((loop->inner)->inner || (loop->inner)->next)
1368 return opt_result::failure_at (vect_location,
1369 "not vectorized:"
1370 " multiple nested loops.\n");
1372 if (loop->num_nodes != 5)
1373 return opt_result::failure_at (vect_location,
1374 "not vectorized:"
1375 " control flow in loop.\n");
1377 entryedge = loop_preheader_edge (innerloop);
1378 if (entryedge->src != loop->header
1379 || !single_exit (innerloop)
1380 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1381 return opt_result::failure_at (vect_location,
1382 "not vectorized:"
1383 " unsupported outerloop form.\n");
1385 /* Analyze the inner-loop. */
1386 tree inner_niterm1, inner_niter, inner_assumptions;
1387 opt_result res
1388 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1389 &inner_assumptions, &inner_niterm1,
1390 &inner_niter, NULL);
1391 if (!res)
1393 if (dump_enabled_p ())
1394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1395 "not vectorized: Bad inner loop.\n");
1396 return res;
1399 /* Don't support analyzing niter under assumptions for inner
1400 loop. */
1401 if (!integer_onep (inner_assumptions))
1402 return opt_result::failure_at (vect_location,
1403 "not vectorized: Bad inner loop.\n");
1405 if (!expr_invariant_in_loop_p (loop, inner_niter))
1406 return opt_result::failure_at (vect_location,
1407 "not vectorized: inner-loop count not"
1408 " invariant.\n");
1410 if (dump_enabled_p ())
1411 dump_printf_loc (MSG_NOTE, vect_location,
1412 "Considering outer-loop vectorization.\n");
1415 if (!single_exit (loop))
1416 return opt_result::failure_at (vect_location,
1417 "not vectorized: multiple exits.\n");
1418 if (EDGE_COUNT (loop->header->preds) != 2)
1419 return opt_result::failure_at (vect_location,
1420 "not vectorized:"
1421 " too many incoming edges.\n");
1423   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1424 that the loop is represented as a do-while (with a proper if-guard
1425 before the loop if needed), where the loop header contains all the
1426 executable statements, and the latch is empty. */
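  /* For example, a loop originally written as

       for (i = 0; i < n; i++)
         a[i] = b[i];

     is expected here in the guarded do-while form

       if (n > 0)
         do
           {
             a[i] = b[i];
             i = i + 1;
           }
         while (i < n);  */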
1427 if (!empty_block_p (loop->latch)
1428 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1429 return opt_result::failure_at (vect_location,
1430 "not vectorized: latch block not empty.\n");
1432 /* Make sure the exit is not abnormal. */
1433 edge e = single_exit (loop);
1434 if (e->flags & EDGE_ABNORMAL)
1435 return opt_result::failure_at (vect_location,
1436 "not vectorized:"
1437 " abnormal loop exit edge.\n");
1439 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1440 number_of_iterationsm1);
1441 if (!*loop_cond)
1442 return opt_result::failure_at
1443 (vect_location,
1444 "not vectorized: complicated exit condition.\n");
1446 if (integer_zerop (*assumptions)
1447 || !*number_of_iterations
1448 || chrec_contains_undetermined (*number_of_iterations))
1449 return opt_result::failure_at
1450 (*loop_cond,
1451 "not vectorized: number of iterations cannot be computed.\n");
1453 if (integer_zerop (*number_of_iterations))
1454 return opt_result::failure_at
1455 (*loop_cond,
1456 "not vectorized: number of iterations = 0.\n");
1458 return opt_result::success ();
1461 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1463 opt_loop_vec_info
1464 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1466 tree assumptions, number_of_iterations, number_of_iterationsm1;
1467 gcond *loop_cond, *inner_loop_cond = NULL;
1469 opt_result res
1470 = vect_analyze_loop_form_1 (loop, &loop_cond,
1471 &assumptions, &number_of_iterationsm1,
1472 &number_of_iterations, &inner_loop_cond);
1473 if (!res)
1474 return opt_loop_vec_info::propagate_failure (res);
1476 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1477 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1478 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1479 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1480 if (!integer_onep (assumptions))
1482       /* We consider vectorizing this loop by versioning it under
1483 some assumptions. In order to do this, we need to clear
1484 existing information computed by scev and niter analyzer. */
1485 scev_reset_htab ();
1486 free_numbers_of_iterations_estimates (loop);
1487 /* Also set flag for this loop so that following scev and niter
1488 analysis are done under the assumptions. */
1489 loop_constraint_set (loop, LOOP_C_FINITE);
1490 /* Also record the assumptions for versioning. */
1491 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1494 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1496 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_NOTE, vect_location,
1499 "Symbolic number of iterations is ");
1500 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1501 dump_printf (MSG_NOTE, "\n");
1505 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1506 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1507 if (inner_loop_cond)
1509 stmt_vec_info inner_loop_cond_info
1510 = loop_vinfo->lookup_stmt (inner_loop_cond);
1511 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1514 gcc_assert (!loop->aux);
1515 loop->aux = loop_vinfo;
1516 return opt_loop_vec_info::success (loop_vinfo);
1521 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1522    statements, update the vectorization factor.  */
1524 static void
1525 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1527 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1528 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1529 int nbbs = loop->num_nodes;
1530 poly_uint64 vectorization_factor;
1531 int i;
1533 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1535 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1536 gcc_assert (known_ne (vectorization_factor, 0U));
1538 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1539 vectorization factor of the loop is the unrolling factor required by
1540      the SLP instances.  If that unrolling factor is 1, we say that we
1541      perform pure SLP on the loop; cross-iteration parallelism is not
1542 exploited. */
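  /* For example, if the current vectorization factor is 4 and the SLP
     instances require an unrolling factor of 2 (illustrative values), a loop
     containing only SLP stmts ends up with VF 2, while a mixed loop gets the
     common multiple 4.  */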
1543 bool only_slp_in_loop = true;
1544 for (i = 0; i < nbbs; i++)
1546 basic_block bb = bbs[i];
1547 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1548 gsi_next (&si))
1550 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1551 if (!stmt_info)
1552 continue;
1553 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1554 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1555 && !PURE_SLP_STMT (stmt_info))
1556 /* STMT needs both SLP and loop-based vectorization. */
1557 only_slp_in_loop = false;
1559 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1560 gsi_next (&si))
1562 if (is_gimple_debug (gsi_stmt (si)))
1563 continue;
1564 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1565 stmt_info = vect_stmt_to_vectorize (stmt_info);
1566 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1567 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1568 && !PURE_SLP_STMT (stmt_info))
1569 /* STMT needs both SLP and loop-based vectorization. */
1570 only_slp_in_loop = false;
1574 if (only_slp_in_loop)
1576 if (dump_enabled_p ())
1577 dump_printf_loc (MSG_NOTE, vect_location,
1578 "Loop contains only SLP stmts\n");
1579 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1581 else
1583 if (dump_enabled_p ())
1584 dump_printf_loc (MSG_NOTE, vect_location,
1585 "Loop contains SLP and non-SLP stmts\n");
1586 /* Both the vectorization factor and unroll factor have the form
1587 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1588 so they must have a common multiple. */
1589 vectorization_factor
1590 = force_common_multiple (vectorization_factor,
1591 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1594 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1595 if (dump_enabled_p ())
1597 dump_printf_loc (MSG_NOTE, vect_location,
1598 "Updating vectorization factor to ");
1599 dump_dec (MSG_NOTE, vectorization_factor);
1600 dump_printf (MSG_NOTE, ".\n");
1604 /* Return true if STMT_INFO describes a double reduction phi and if
1605 the other phi in the reduction is also relevant for vectorization.
1606 This rejects cases such as:
1608 outer1:
1609 x_1 = PHI <x_3(outer2), ...>;
1612 inner:
1613 x_2 = ...;
1616 outer2:
1617 x_3 = PHI <x_2(inner)>;
1619 if nothing in x_2 or elsewhere makes x_1 relevant. */
1621 static bool
1622 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1624 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1625 return false;
1627 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1630 /* Function vect_analyze_loop_operations.
1632 Scan the loop stmts and make sure they are all vectorizable. */
1634 static opt_result
1635 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1637 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1638 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1639 int nbbs = loop->num_nodes;
1640 int i;
1641 stmt_vec_info stmt_info;
1642 bool need_to_vectorize = false;
1643 bool ok;
1645 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1647 auto_vec<stmt_info_for_cost> cost_vec;
1649 for (i = 0; i < nbbs; i++)
1651 basic_block bb = bbs[i];
1653 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1654 gsi_next (&si))
1656 gphi *phi = si.phi ();
1657 ok = true;
1659 stmt_info = loop_vinfo->lookup_stmt (phi);
1660 if (dump_enabled_p ())
1661 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1662 if (virtual_operand_p (gimple_phi_result (phi)))
1663 continue;
1665 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1666 (i.e., a phi in the tail of the outer-loop). */
1667 if (! is_loop_header_bb_p (bb))
1669 /* FORNOW: we currently don't support the case that these phis
1670              are not used in the outer loop (unless it is a double reduction,
1671              i.e., this phi is vect_reduction_def), because this case
1672              requires us to actually do something here.  */
1673 if (STMT_VINFO_LIVE_P (stmt_info)
1674 && !vect_active_double_reduction_p (stmt_info))
1675 return opt_result::failure_at (phi,
1676 "Unsupported loop-closed phi"
1677 " in outer-loop.\n");
1679 /* If PHI is used in the outer loop, we check that its operand
1680 is defined in the inner loop. */
1681 if (STMT_VINFO_RELEVANT_P (stmt_info))
1683 tree phi_op;
1685 if (gimple_phi_num_args (phi) != 1)
1686 return opt_result::failure_at (phi, "unsupported phi");
1688 phi_op = PHI_ARG_DEF (phi, 0);
1689 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1690 if (!op_def_info)
1691 return opt_result::failure_at (phi, "unsupported phi\n");
1693 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1694 && (STMT_VINFO_RELEVANT (op_def_info)
1695 != vect_used_in_outer_by_reduction))
1696 return opt_result::failure_at (phi, "unsupported phi\n");
1698 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1699 || (STMT_VINFO_DEF_TYPE (stmt_info)
1700 == vect_double_reduction_def))
1701 && !vectorizable_lc_phi (loop_vinfo,
1702 stmt_info, NULL, NULL))
1703 return opt_result::failure_at (phi, "unsupported phi\n");
1706 continue;
1709 gcc_assert (stmt_info);
1711 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1712 || STMT_VINFO_LIVE_P (stmt_info))
1713 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1714 /* A scalar-dependence cycle that we don't support. */
1715 return opt_result::failure_at (phi,
1716 "not vectorized:"
1717 " scalar dependence cycle.\n");
1719 if (STMT_VINFO_RELEVANT_P (stmt_info))
1721 need_to_vectorize = true;
1722 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1723 && ! PURE_SLP_STMT (stmt_info))
1724 ok = vectorizable_induction (loop_vinfo,
1725 stmt_info, NULL, NULL,
1726 &cost_vec);
1727 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1728 || (STMT_VINFO_DEF_TYPE (stmt_info)
1729 == vect_double_reduction_def)
1730 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1731 && ! PURE_SLP_STMT (stmt_info))
1732 ok = vectorizable_reduction (loop_vinfo,
1733 stmt_info, NULL, NULL, &cost_vec);
1736 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1737 if (ok
1738 && STMT_VINFO_LIVE_P (stmt_info)
1739 && !PURE_SLP_STMT (stmt_info))
1740 ok = vectorizable_live_operation (loop_vinfo,
1741 stmt_info, NULL, NULL, NULL,
1742 -1, false, &cost_vec);
1744 if (!ok)
1745 return opt_result::failure_at (phi,
1746 "not vectorized: relevant phi not "
1747 "supported: %G",
1748 static_cast <gimple *> (phi));
1751 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1752 gsi_next (&si))
1754 gimple *stmt = gsi_stmt (si);
1755 if (!gimple_clobber_p (stmt)
1756 && !is_gimple_debug (stmt))
1758 opt_result res
1759 = vect_analyze_stmt (loop_vinfo,
1760 loop_vinfo->lookup_stmt (stmt),
1761 &need_to_vectorize,
1762 NULL, NULL, &cost_vec);
1763 if (!res)
1764 return res;
1767 } /* bbs */
1769 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1771 /* All operations in the loop are either irrelevant (deal with loop
1772 control, or dead), or only used outside the loop and can be moved
1773 out of the loop (e.g. invariants, inductions). The loop can be
1774 optimized away by scalar optimizations. We're better off not
1775 touching this loop. */
1776 if (!need_to_vectorize)
1778 if (dump_enabled_p ())
1779 dump_printf_loc (MSG_NOTE, vect_location,
1780 "All the computation can be taken out of the loop.\n");
1781 return opt_result::failure_at
1782 (vect_location,
1783 "not vectorized: redundant loop. no profit to vectorize.\n");
1786 return opt_result::success ();
1789 /* Return true if we know that the iteration count is smaller than the
1790 vectorization factor. Return false if it isn't, or if we can't be sure
1791 either way. */
1793 static bool
1794 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1796 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1798 HOST_WIDE_INT max_niter;
1799 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1800 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1801 else
1802 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1804 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1805 return true;
1807 return false;
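/* For illustration (hypothetical values): with an assumed vectorization
   factor of 8, a loop such as

     for (int i = 0; i < 5; i++)   // niter known to be 5 < VF == 8
       a[i] = b[i] + c[i];

   makes this predicate return true, since even one full vector iteration
   would overshoot the scalar iteration count.  */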
1810 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1811 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1812 definitely no, or -1 if it's worth retrying. */
1814 static int
1815 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1817 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1818 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1820 /* Only loops that can handle partially-populated vectors can have iteration
1821 counts less than the vectorization factor. */
1822 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1824 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1826 if (dump_enabled_p ())
1827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1828 "not vectorized: iteration count smaller than "
1829 "vectorization factor.\n");
1830 return 0;
1834 /* If using the "very cheap" model, reject cases in which we'd keep
1835 a copy of the scalar code (even if we might be able to vectorize it). */
1836 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1837 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1838 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1839 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1841 if (dump_enabled_p ())
1842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1843 "some scalar iterations would need to be peeled\n");
1844 return 0;
1847 int min_profitable_iters, min_profitable_estimate;
1848 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1849 &min_profitable_estimate);
1851 if (min_profitable_iters < 0)
1853 if (dump_enabled_p ())
1854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1855 "not vectorized: vectorization not profitable.\n");
1856 if (dump_enabled_p ())
1857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1858 "not vectorized: vector version will never be "
1859 "profitable.\n");
1860 return -1;
1863 int min_scalar_loop_bound = (param_min_vect_loop_bound
1864 * assumed_vf);
1866 /* Use the cost model only if it is more conservative than the
1867 user-specified threshold. */
1868 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1869 min_profitable_iters);
1871 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1873 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1874 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1876 if (dump_enabled_p ())
1877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1878 "not vectorized: vectorization not profitable.\n");
1879 if (dump_enabled_p ())
1880 dump_printf_loc (MSG_NOTE, vect_location,
1881 "not vectorized: iteration count smaller than user "
1882 "specified loop bound parameter or minimum profitable "
1883 "iterations (whichever is more conservative).\n");
1884 return 0;
1887 /* The static profitability threshold min_profitable_estimate includes
1888 the cost of having to check at runtime whether the scalar loop
1889 should be used instead. If it turns out that we don't need or want
1890 such a check, the threshold we should use for the static estimate
1891 is simply the point at which the vector loop becomes more profitable
1892 than the scalar loop. */
1893 if (min_profitable_estimate > min_profitable_iters
1894 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1895 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1896 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1897 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1899 if (dump_enabled_p ())
1900 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1901 " choice between the scalar and vector loops\n");
1902 min_profitable_estimate = min_profitable_iters;
1905 /* If the vector loop needs multiple iterations to be beneficial then
1906 things are probably too close to call, and the conservative thing
1907 would be to stick with the scalar code. */
1908 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1909 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1911 if (dump_enabled_p ())
1912 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1913 "one iteration of the vector loop would be"
1914 " more expensive than the equivalent number of"
1915 " iterations of the scalar loop\n");
1916 return 0;
1919 HOST_WIDE_INT estimated_niter;
1921 /* If we are vectorizing an epilogue then we know the maximum number of
1922 scalar iterations it will cover is at least one lower than the
1923 vectorization factor of the main loop. */
1924 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1925 estimated_niter
1926 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1927 else
1929 estimated_niter = estimated_stmt_executions_int (loop);
1930 if (estimated_niter == -1)
1931 estimated_niter = likely_max_stmt_executions_int (loop);
1933 if (estimated_niter != -1
1934 && ((unsigned HOST_WIDE_INT) estimated_niter
1935 < MAX (th, (unsigned) min_profitable_estimate)))
1937 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1939 "not vectorized: estimated iteration count too "
1940 "small.\n");
1941 if (dump_enabled_p ())
1942 dump_printf_loc (MSG_NOTE, vect_location,
1943 "not vectorized: estimated iteration count smaller "
1944 "than specified loop bound parameter or minimum "
1945 "profitable iterations (whichever is more "
1946 "conservative).\n");
1947 return -1;
1950 return 1;
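/* Worked example for the thresholds above (hypothetical numbers): if the
   cost model computes min_profitable_iters == 12 while
   param_min_vect_loop_bound * assumed_vf == 8, then th = MAX (8, 12) == 12,
   and a loop with a known iteration count of 10 is rejected above as not
   profitable.  */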
1953 static opt_result
1954 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1955 vec<data_reference_p> *datarefs,
1956 unsigned int *n_stmts)
1958 *n_stmts = 0;
1959 for (unsigned i = 0; i < loop->num_nodes; i++)
1960 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1961 !gsi_end_p (gsi); gsi_next (&gsi))
1963 gimple *stmt = gsi_stmt (gsi);
1964 if (is_gimple_debug (stmt))
1965 continue;
1966 ++(*n_stmts);
1967 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1968 NULL, 0);
1969 if (!res)
1971 if (is_gimple_call (stmt) && loop->safelen)
1973 tree fndecl = gimple_call_fndecl (stmt), op;
1974 if (fndecl != NULL_TREE)
1976 cgraph_node *node = cgraph_node::get (fndecl);
1977 if (node != NULL && node->simd_clones != NULL)
1979 unsigned int j, n = gimple_call_num_args (stmt);
1980 for (j = 0; j < n; j++)
1982 op = gimple_call_arg (stmt, j);
1983 if (DECL_P (op)
1984 || (REFERENCE_CLASS_P (op)
1985 && get_base_address (op)))
1986 break;
1988 op = gimple_call_lhs (stmt);
1989 /* Ignore #pragma omp declare simd functions
1990 if they don't have data references in the
1991 call stmt itself. */
1992 if (j == n
1993 && !(op
1994 && (DECL_P (op)
1995 || (REFERENCE_CLASS_P (op)
1996 && get_base_address (op)))))
1997 continue;
2001 return res;
2003 /* If dependence analysis will give up due to the limit on the
2004 number of datarefs, stop here and fail fatally. */
2005 if (datarefs->length ()
2006 > (unsigned)param_loop_max_datarefs_for_datadeps)
2007 return opt_result::failure_at (stmt, "exceeded param "
2008 "loop-max-datarefs-for-datadeps\n");
2010 return opt_result::success ();
2013 /* Look for SLP-only access groups and turn each individual access into its own
2014 group. */
2015 static void
2016 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2018 unsigned int i;
2019 struct data_reference *dr;
2021 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2023 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2024 FOR_EACH_VEC_ELT (datarefs, i, dr)
2026 gcc_assert (DR_REF (dr));
2027 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2029 /* Check if the load is a part of an interleaving chain. */
2030 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2032 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2033 unsigned int group_size = DR_GROUP_SIZE (first_element);
2035 /* Check whether this is an SLP-only group. */
2036 if (!STMT_SLP_TYPE (stmt_info)
2037 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2039 /* Dissolve the group. */
2040 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2042 stmt_vec_info vinfo = first_element;
2043 while (vinfo)
2045 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2046 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2047 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2048 DR_GROUP_SIZE (vinfo) = 1;
2049 if (STMT_VINFO_STRIDED_P (first_element))
2050 DR_GROUP_GAP (vinfo) = 0;
2051 else
2052 DR_GROUP_GAP (vinfo) = group_size - 1;
2053 vinfo = next;
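/* Sketch of the effect of the dissolution above, for a hypothetical
   interleaving group of three accesses A, B and C (DR_GROUP_SIZE == 3):

     before:  A -> B -> C   (DR_GROUP_FIRST_ELEMENT == A for all three)
     after:   A    B    C   (each its own group, DR_GROUP_SIZE == 1,
                             DR_GROUP_GAP == group_size - 1 == 2, or 0
                             for strided accesses)  */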
2060 /* Determine if operating on full vectors for LOOP_VINFO might leave
2061 some scalar iterations still to do. If so, decide how we should
2062 handle those scalar iterations. The possibilities are:
2064 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2065 In this case:
2067 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2068 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2069 LOOP_VINFO_PEELING_FOR_NITER == false
2071 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2072 to handle the remaining scalar iterations. In this case:
2074 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2075 LOOP_VINFO_PEELING_FOR_NITER == true
2077 There are two choices:
2079 (2a) Consider vectorizing the epilogue loop at the same VF as the
2080 main loop, but using partial vectors instead of full vectors.
2081 In this case:
2083 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2085 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2086 In this case:
2088 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2090 When FOR_EPILOGUE_P is true, make this determination based on the
2091 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2092 based on the assumption that LOOP_VINFO is the main loop. The caller
2093 has made sure that the number of iterations is set appropriately for
2094 this value of FOR_EPILOGUE_P. */
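/* Illustration of the cases above (hypothetical counts): with VF == 16 and
   1000 scalar iterations, 62 full vector iterations leave 8 scalar
   iterations over.  Case (1) handles those 8 inside the vector loop itself
   using partial vectors; case (2) runs 62 full-vector iterations and
   leaves the remaining 8 to an epilogue loop, which may in turn use
   partial vectors (2a) or a lower VF (2b).  */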
2096 opt_result
2097 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2098 bool for_epilogue_p)
2100 /* Determine whether there would be any scalar iterations left over. */
2101 bool need_peeling_or_partial_vectors_p
2102 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2104 /* Decide whether to vectorize the loop with partial vectors. */
2105 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2106 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2107 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2108 && need_peeling_or_partial_vectors_p)
2110 /* For partial-vector-usage=1, try to push the handling of partial
2111 vectors to the epilogue, with the main loop continuing to operate
2112 on full vectors.
2114 ??? We could then end up failing to use partial vectors if we
2115 decide to peel iterations into a prologue, and if the main loop
2116 then ends up processing fewer than VF iterations. */
2117 if (param_vect_partial_vector_usage == 1
2118 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2119 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2120 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2121 else
2122 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2125 if (dump_enabled_p ())
2127 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2128 dump_printf_loc (MSG_NOTE, vect_location,
2129 "operating on partial vectors%s.\n",
2130 for_epilogue_p ? " for epilogue loop" : "");
2131 else
2132 dump_printf_loc (MSG_NOTE, vect_location,
2133 "operating only on full vectors%s.\n",
2134 for_epilogue_p ? " for epilogue loop" : "");
2137 if (for_epilogue_p)
2139 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2140 gcc_assert (orig_loop_vinfo);
2141 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2142 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2143 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2146 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2147 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2149 /* Check that the loop processes at least one full vector. */
2150 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2151 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2152 if (known_lt (wi::to_widest (scalar_niters), vf))
2153 return opt_result::failure_at (vect_location,
2154 "loop does not have enough iterations"
2155 " to support vectorization.\n");
2157 /* If we need to peel an extra epilogue iteration to handle data
2158 accesses with gaps, check that there are enough scalar iterations
2159 available.
2161 The check above is redundant with this one when peeling for gaps,
2162 but the distinction is useful for diagnostics. */
2163 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2164 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2165 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2166 return opt_result::failure_at (vect_location,
2167 "loop does not have enough iterations"
2168 " to support peeling for gaps.\n");
2171 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2172 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2173 && need_peeling_or_partial_vectors_p);
2175 return opt_result::success ();
2178 /* Function vect_analyze_loop_2.
2180 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2181 for it. The different analyses will record information in the
2182 loop_vec_info struct. */
2183 static opt_result
2184 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2186 opt_result ok = opt_result::success ();
2187 int res;
2188 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2189 poly_uint64 min_vf = 2;
2190 loop_vec_info orig_loop_vinfo = NULL;
2192 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2193 loop_vec_info of the first vectorized loop. */
2194 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2195 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2196 else
2197 orig_loop_vinfo = loop_vinfo;
2198 gcc_assert (orig_loop_vinfo);
2200 /* The first group of checks is independent of the vector size. */
2201 fatal = true;
2203 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2204 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2205 return opt_result::failure_at (vect_location,
2206 "not vectorized: simd if(0)\n");
2208 /* Find all data references in the loop (which correspond to vdefs/vuses)
2209 and analyze their evolution in the loop. */
2211 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2213 /* Gather the data references and count stmts in the loop. */
2214 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2216 opt_result res
2217 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2218 &LOOP_VINFO_DATAREFS (loop_vinfo),
2219 n_stmts);
2220 if (!res)
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2224 "not vectorized: loop contains function "
2225 "calls or data references that cannot "
2226 "be analyzed\n");
2227 return res;
2229 loop_vinfo->shared->save_datarefs ();
2231 else
2232 loop_vinfo->shared->check_datarefs ();
2234 /* Analyze the data references and also adjust the minimal
2235 vectorization factor according to the loads and stores. */
2237 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2238 if (!ok)
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2242 "bad data references.\n");
2243 return ok;
2246 /* Classify all cross-iteration scalar data-flow cycles.
2247 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2248 vect_analyze_scalar_cycles (loop_vinfo);
2250 vect_pattern_recog (loop_vinfo);
2252 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2254 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2255 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2257 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2258 if (!ok)
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 "bad data access.\n");
2263 return ok;
2266 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2268 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2269 if (!ok)
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2273 "unexpected pattern.\n");
2274 return ok;
2277 /* The rest of the analysis below depends on the vector size in some way, so failures past this point are not fatal. */
2278 fatal = false;
2280 /* Analyze data dependences between the data-refs in the loop
2281 and adjust the maximum vectorization factor according to
2282 the dependences.
2283 FORNOW: fail at the first data dependence that we encounter. */
2285 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2286 if (!ok)
2288 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2290 "bad data dependence.\n");
2291 return ok;
2293 if (max_vf != MAX_VECTORIZATION_FACTOR
2294 && maybe_lt (max_vf, min_vf))
2295 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2296 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2298 ok = vect_determine_vectorization_factor (loop_vinfo);
2299 if (!ok)
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303 "can't determine vectorization factor.\n");
2304 return ok;
2306 if (max_vf != MAX_VECTORIZATION_FACTOR
2307 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2308 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2310 /* Compute the scalar iteration cost. */
2311 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2313 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2315 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2316 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2317 if (!ok)
2318 return ok;
2320 /* If there are any SLP instances mark them as pure_slp. */
2321 bool slp = vect_make_slp_decision (loop_vinfo);
2322 if (slp)
2324 /* Find stmts that need to be both vectorized and SLPed. */
2325 vect_detect_hybrid_slp (loop_vinfo);
2327 /* Update the vectorization factor based on the SLP decision. */
2328 vect_update_vf_for_slp (loop_vinfo);
2330 /* Optimize the SLP graph with the vectorization factor fixed. */
2331 vect_optimize_slp (loop_vinfo);
2333 /* Gather the loads reachable from the SLP graph entries. */
2334 vect_gather_slp_loads (loop_vinfo);
2337 bool saved_can_use_partial_vectors_p
2338 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2340 /* We don't expect to have to roll back to anything other than an empty
2341 set of rgroups. */
2342 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2344 /* This is the point where we can re-start analysis with SLP forced off. */
2345 start_over:
2347 /* Now the vectorization factor is final. */
2348 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2349 gcc_assert (known_ne (vectorization_factor, 0U));
2351 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2353 dump_printf_loc (MSG_NOTE, vect_location,
2354 "vectorization_factor = ");
2355 dump_dec (MSG_NOTE, vectorization_factor);
2356 dump_printf (MSG_NOTE, ", niters = %wd\n",
2357 LOOP_VINFO_INT_NITERS (loop_vinfo));
2360 /* Analyze the alignment of the data-refs in the loop.
2361 Fail if a data reference is found that cannot be vectorized. */
2363 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2364 if (!ok)
2366 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2368 "bad data alignment.\n");
2369 return ok;
2372 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2373 It is important to call pruning after vect_analyze_data_ref_accesses,
2374 since we use grouping information gathered by interleaving analysis. */
2375 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2376 if (!ok)
2377 return ok;
2379 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2380 vectorization, since we do not want to add extra peeling or
2381 add versioning for alignment. */
2382 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2383 /* This pass will decide on using loop versioning and/or loop peeling in
2384 order to enhance the alignment of data references in the loop. */
2385 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2386 if (!ok)
2387 return ok;
2389 if (slp)
2391 /* Analyze operations in the SLP instances. Note this may
2392 remove unsupported SLP instances which makes the above
2393 SLP kind detection invalid. */
2394 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2395 vect_slp_analyze_operations (loop_vinfo);
2396 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2398 ok = opt_result::failure_at (vect_location,
2399 "unsupported SLP instances\n");
2400 goto again;
2403 /* Check whether any load in ALL SLP instances is possibly permuted. */
2404 slp_tree load_node, slp_root;
2405 unsigned i, x;
2406 slp_instance instance;
2407 bool can_use_lanes = true;
2408 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2410 slp_root = SLP_INSTANCE_TREE (instance);
2411 int group_size = SLP_TREE_LANES (slp_root);
2412 tree vectype = SLP_TREE_VECTYPE (slp_root);
2413 bool loads_permuted = false;
2414 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2416 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2417 continue;
2418 unsigned j;
2419 stmt_vec_info load_info;
2420 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2421 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2423 loads_permuted = true;
2424 break;
2428 /* If the loads and stores can be handled with load/store-lane
2429 instructions record it and move on to the next instance. */
2430 if (loads_permuted
2431 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2432 && vect_store_lanes_supported (vectype, group_size, false))
2434 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2436 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2437 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2438 /* Use SLP for strided accesses (or if we can't
2439 load-lanes). */
2440 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2441 || ! vect_load_lanes_supported
2442 (STMT_VINFO_VECTYPE (stmt_vinfo),
2443 DR_GROUP_SIZE (stmt_vinfo), false))
2444 break;
2447 can_use_lanes
2448 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2450 if (can_use_lanes && dump_enabled_p ())
2451 dump_printf_loc (MSG_NOTE, vect_location,
2452 "SLP instance %p can use load/store-lanes\n",
2453 instance);
2455 else
2457 can_use_lanes = false;
2458 break;
2462 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2463 with SLP disabled. */
2464 if (can_use_lanes)
2466 ok = opt_result::failure_at (vect_location,
2467 "Built SLP cancelled: can use "
2468 "load/store-lanes\n");
2469 if (dump_enabled_p ())
2470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2471 "Built SLP cancelled: all SLP instances support "
2472 "load/store-lanes\n");
2473 goto again;
2477 /* Dissolve SLP-only groups. */
2478 vect_dissolve_slp_only_groups (loop_vinfo);
2480 /* Scan all the remaining operations in the loop that are not subject
2481 to SLP and make sure they are vectorizable. */
2482 ok = vect_analyze_loop_operations (loop_vinfo);
2483 if (!ok)
2485 if (dump_enabled_p ())
2486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2487 "bad operation or unsupported loop bound.\n");
2488 return ok;
2491 /* For now, we don't expect to mix both masking and length approaches for one
2492 loop; disable partial vectors if both are recorded. */
2493 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2494 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2495 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2499 "can't vectorize a loop with partial vectors"
2500 " because we don't expect to mix different"
2501 " approaches with partial vectors for the"
2502 " same loop.\n");
2503 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2506 /* If we still have the option of using partial vectors,
2507 check whether we can generate the necessary loop controls. */
2508 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2509 && !vect_verify_full_masking (loop_vinfo)
2510 && !vect_verify_loop_lens (loop_vinfo))
2511 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2513 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2514 to be able to handle fewer than VF scalars, or needs to have a lower VF
2515 than the main loop. */
2516 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2517 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2518 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2519 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2520 return opt_result::failure_at (vect_location,
2521 "Vectorization factor too high for"
2522 " epilogue loop.\n");
2524 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2525 assuming that the loop will be used as a main loop. We will redo
2526 this analysis later if we instead decide to use the loop as an
2527 epilogue loop. */
2528 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2529 if (!ok)
2530 return ok;
2532 /* Check the costings of the loop make vectorizing worthwhile. */
2533 res = vect_analyze_loop_costing (loop_vinfo);
2534 if (res < 0)
2536 ok = opt_result::failure_at (vect_location,
2537 "Loop costings may not be worthwhile.\n");
2538 goto again;
2540 if (!res)
2541 return opt_result::failure_at (vect_location,
2542 "Loop costings not worthwhile.\n");
2544 /* If an epilogue loop is required make sure we can create one. */
2545 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2546 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2548 if (dump_enabled_p ())
2549 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2550 if (!vect_can_advance_ivs_p (loop_vinfo)
2551 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2552 single_exit (LOOP_VINFO_LOOP
2553 (loop_vinfo))))
2555 ok = opt_result::failure_at (vect_location,
2556 "not vectorized: can't create required "
2557 "epilog loop\n");
2558 goto again;
2562 /* During peeling, we need to check that the number of loop iterations is
2563 enough for both the peeled prolog loop and the vector loop. This check
2564 can be merged with the threshold check of loop versioning, so
2565 increase the threshold for this case if necessary.
2567 If we are analyzing an epilogue we still want to check what its
2568 versioning threshold would be. If we decide to vectorize the epilogues we
2569 will want to use the lowest versioning threshold of all epilogues and main
2570 loop. This will enable us to enter a vectorized epilogue even when
2571 versioning the loop. We can't simply check whether the epilogue requires
2572 versioning though since we may have skipped some versioning checks when
2573 analyzing the epilogue. For instance, checks for alias versioning will be
2574 skipped when dealing with epilogues as we assume we already checked them
2575 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2576 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2578 poly_uint64 niters_th = 0;
2579 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2581 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2583 /* Niters for peeled prolog loop. */
2584 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2586 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2587 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2588 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2590 else
2591 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2594 /* Niters for at least one iteration of vectorized loop. */
2595 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2596 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2597 /* One additional iteration because of peeling for gap. */
2598 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2599 niters_th += 1;
2601 /* Use the same condition as vect_transform_loop to decide when to use
2602 the cost to determine a versioning threshold. */
2603 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2604 && ordered_p (th, niters_th))
2605 niters_th = ordered_max (poly_uint64 (th), niters_th);
2607 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
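/* Worked example for the computation above (hypothetical values): with
   VF == 4, an unknown misalignment (so TYPE_VECTOR_SUBPARTS - 1 == 3
   prologue iterations), full vectors only and peeling for gaps, the
   threshold is niters_th = 3 + 4 + 1 == 8, possibly raised further to the
   cost-model threshold TH.  */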
2610 gcc_assert (known_eq (vectorization_factor,
2611 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2613 /* Ok to vectorize! */
2614 return opt_result::success ();
2616 again:
2617 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2618 gcc_assert (!ok);
2620 /* Try again with SLP forced off but if we didn't do any SLP there is
2621 no point in re-trying. */
2622 if (!slp)
2623 return ok;
2625 /* If there are reduction chains re-trying will fail anyway. */
2626 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2627 return ok;
2629 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2630 via interleaving or lane instructions. */
2631 slp_instance instance;
2632 slp_tree node;
2633 unsigned i, j;
2634 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2636 stmt_vec_info vinfo;
2637 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2638 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2639 continue;
2640 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2641 unsigned int size = DR_GROUP_SIZE (vinfo);
2642 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2643 if (! vect_store_lanes_supported (vectype, size, false)
2644 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2645 && ! vect_grouped_store_supported (vectype, size))
2646 return opt_result::failure_at (vinfo->stmt,
2647 "unsupported grouped store\n");
2648 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2650 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2651 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2652 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2653 size = DR_GROUP_SIZE (vinfo);
2654 vectype = STMT_VINFO_VECTYPE (vinfo);
2655 if (! vect_load_lanes_supported (vectype, size, false)
2656 && ! vect_grouped_load_supported (vectype, single_element_p,
2657 size))
2658 return opt_result::failure_at (vinfo->stmt,
2659 "unsupported grouped load\n");
2663 if (dump_enabled_p ())
2664 dump_printf_loc (MSG_NOTE, vect_location,
2665 "re-trying with SLP disabled\n");
2667 /* Roll back state appropriately. No SLP this time. */
2668 slp = false;
2670 /* Restore the vectorization factor as it was without SLP. */
2670 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2671 /* Free the SLP instances. */
2672 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2673 vect_free_slp_instance (instance);
2674 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2675 /* Reset SLP type to loop_vect on all stmts. */
2676 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2678 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2679 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2680 !gsi_end_p (si); gsi_next (&si))
2682 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2683 STMT_SLP_TYPE (stmt_info) = loop_vect;
2684 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2685 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2687 /* vectorizable_reduction adjusts reduction stmt def-types,
2688 restore them to that of the PHI. */
2689 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2690 = STMT_VINFO_DEF_TYPE (stmt_info);
2691 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2692 (STMT_VINFO_REDUC_DEF (stmt_info)))
2693 = STMT_VINFO_DEF_TYPE (stmt_info);
2696 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2697 !gsi_end_p (si); gsi_next (&si))
2699 if (is_gimple_debug (gsi_stmt (si)))
2700 continue;
2701 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2702 STMT_SLP_TYPE (stmt_info) = loop_vect;
2703 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2705 stmt_vec_info pattern_stmt_info
2706 = STMT_VINFO_RELATED_STMT (stmt_info);
2707 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2708 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2710 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2711 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2712 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2713 !gsi_end_p (pi); gsi_next (&pi))
2714 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2715 = loop_vect;
2719 /* Free optimized alias test DDRS. */
2720 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2721 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2722 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2723 /* Reset target cost data. */
2724 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2725 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2726 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2727 /* Reset accumulated rgroup information. */
2728 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2729 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2730 /* Reset assorted flags. */
2731 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2732 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2733 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2734 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2735 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2736 = saved_can_use_partial_vectors_p;
2738 goto start_over;
2741 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2742 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2743 OLD_LOOP_VINFO is better unless something specifically indicates
2744 otherwise.
2746 Note that this deliberately isn't a partial order. */
2748 static bool
2749 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2750 loop_vec_info old_loop_vinfo)
2752 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2753 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2755 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2756 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2758 /* Always prefer a VF of loop->simdlen over any other VF. */
2759 if (loop->simdlen)
2761 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2762 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2763 if (new_simdlen_p != old_simdlen_p)
2764 return new_simdlen_p;
2767 /* Limit the VFs to what is likely to be the maximum number of iterations,
2768 to handle cases in which at least one loop_vinfo is fully-masked. */
2769 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2770 if (estimated_max_niter != -1)
2772 if (known_le (estimated_max_niter, new_vf))
2773 new_vf = estimated_max_niter;
2774 if (known_le (estimated_max_niter, old_vf))
2775 old_vf = estimated_max_niter;
2778 /* Check whether the (fractional) cost per scalar iteration is lower
2779 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2780 poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
2781 poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
2783 HOST_WIDE_INT est_rel_new_min
2784 = estimated_poly_value (rel_new, POLY_VALUE_MIN);
2785 HOST_WIDE_INT est_rel_new_max
2786 = estimated_poly_value (rel_new, POLY_VALUE_MAX);
2788 HOST_WIDE_INT est_rel_old_min
2789 = estimated_poly_value (rel_old, POLY_VALUE_MIN);
2790 HOST_WIDE_INT est_rel_old_max
2791 = estimated_poly_value (rel_old, POLY_VALUE_MAX);
2793 /* Check first if we can make out an unambiguous total order from the minimum
2794 and maximum estimates. */
2795 if (est_rel_new_min < est_rel_old_min
2796 && est_rel_new_max < est_rel_old_max)
2797 return true;
2798 else if (est_rel_old_min < est_rel_new_min
2799 && est_rel_old_max < est_rel_new_max)
2800 return false;
2801 /* When old_loop_vinfo uses a variable vectorization factor,
2802 we know that it has a lower cost for at least one runtime VF.
2803 However, we don't know how likely that VF is.
2805 One option would be to compare the costs for the estimated VFs.
2806 The problem is that that can put too much pressure on the cost
2807 model. E.g. if the estimated VF is also the lowest possible VF,
2808 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2809 for the estimated VF, we'd then choose new_loop_vinfo even
2810 though (a) new_loop_vinfo might not actually be better than
2811 old_loop_vinfo for that VF and (b) it would be significantly
2812 worse at larger VFs.
2814 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2815 no more expensive than old_loop_vinfo even after doubling the
2816 estimated old_loop_vinfo VF. For all but trivial loops, this
2817 ensures that we only pick new_loop_vinfo if it is significantly
2818 better than old_loop_vinfo at the estimated VF. */
2820 if (est_rel_old_min != est_rel_new_min
2821 || est_rel_old_max != est_rel_new_max)
2823 HOST_WIDE_INT est_rel_new_likely
2824 = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
2825 HOST_WIDE_INT est_rel_old_likely
2826 = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
2828 return est_rel_new_likely * 2 <= est_rel_old_likely;
2831 /* If there's nothing to choose between the loop bodies, see whether
2832 there's a difference in the prologue and epilogue costs. */
2833 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2834 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2836 return false;
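/* Worked example for the relative-cost test above (hypothetical costs):
   new_inside_cost / new_vf is compared against old_inside_cost / old_vf
   without division, e.g.

     new: inside cost 20, VF 8  ->  rel_new = 20 * 4 == 80
     old: inside cost 12, VF 4  ->  rel_old = 12 * 8 == 96

   so the new loop_vinfo is cheaper per scalar iteration and is preferred
   when both the minimum and maximum estimates agree on the order.  */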
2839 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2840 true if we should. */
2842 static bool
2843 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2844 loop_vec_info old_loop_vinfo)
2846 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2847 return false;
2849 if (dump_enabled_p ())
2850 dump_printf_loc (MSG_NOTE, vect_location,
2851 "***** Preferring vector mode %s to vector mode %s\n",
2852 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2853 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2854 return true;
2857 /* If LOOP_VINFO is already a main loop, return it unmodified. Otherwise
2858 try to reanalyze it as a main loop. Return the loop_vinfo on success
2859 and null on failure. */
2861 static loop_vec_info
2862 vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
2864 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2865 return loop_vinfo;
2867 if (dump_enabled_p ())
2868 dump_printf_loc (MSG_NOTE, vect_location,
2869 "***** Reanalyzing as a main loop with vector mode %s\n",
2870 GET_MODE_NAME (loop_vinfo->vector_mode));
2872 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2873 vec_info_shared *shared = loop_vinfo->shared;
2874 opt_loop_vec_info main_loop_vinfo = vect_analyze_loop_form (loop, shared);
2875 gcc_assert (main_loop_vinfo);
2877 main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;
2879 bool fatal = false;
2880 bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
2881 loop->aux = NULL;
2882 if (!res)
2884 if (dump_enabled_p ())
2885 dump_printf_loc (MSG_NOTE, vect_location,
2886 "***** Failed to analyze main loop with vector"
2887 " mode %s\n",
2888 GET_MODE_NAME (loop_vinfo->vector_mode));
2889 delete main_loop_vinfo;
2890 return NULL;
2892 LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo) = 1;
2893 return main_loop_vinfo;
2896 /* Function vect_analyze_loop.
2898 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2899 for it. The different analyses will record information in the
2900 loop_vec_info struct. */
2901 opt_loop_vec_info
2902 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2904 auto_vector_modes vector_modes;
2906 /* Autodetect first vector size we try. */
2907 unsigned int autovec_flags
2908 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2909 loop->simdlen != 0);
2910 unsigned int mode_i = 0;
2912 DUMP_VECT_SCOPE ("analyze_loop_nest");
2914 if (loop_outer (loop)
2915 && loop_vec_info_for_loop (loop_outer (loop))
2916 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2917 return opt_loop_vec_info::failure_at (vect_location,
2918 "outer-loop already vectorized.\n");
2920 if (!find_loop_nest (loop, &shared->loop_nest))
2921 return opt_loop_vec_info::failure_at
2922 (vect_location,
2923 "not vectorized: loop nest containing two or more consecutive inner"
2924 " loops cannot be vectorized\n");
2926 unsigned n_stmts = 0;
2927 machine_mode autodetected_vector_mode = VOIDmode;
2928 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2929 machine_mode next_vector_mode = VOIDmode;
2930 poly_uint64 lowest_th = 0;
2931 unsigned vectorized_loops = 0;
2932 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2933 && !unlimited_cost_model (loop));
2935 bool vect_epilogues = false;
2936 opt_result res = opt_result::success ();
2937 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2938 while (1)
2940 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2941 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2942 if (!loop_vinfo)
2944 if (dump_enabled_p ())
2945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2946 "bad loop form.\n");
2947 gcc_checking_assert (first_loop_vinfo == NULL);
2948 return loop_vinfo;
2950 loop_vinfo->vector_mode = next_vector_mode;
2952 bool fatal = false;
2954 /* When pick_lowest_cost_p is true, we should in principle iterate
2955 over all the loop_vec_infos that LOOP_VINFO could replace and
2956 try to vectorize LOOP_VINFO under the same conditions.
2957 E.g. when trying to replace an epilogue loop, we should vectorize
2958 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2959 to replace the main loop, we should vectorize LOOP_VINFO as a main
2960 loop too.
2962 However, autovectorize_vector_modes is usually sorted as follows:
2964 - Modes that naturally produce lower VFs usually follow modes that
2965 naturally produce higher VFs.
2967 - When modes naturally produce the same VF, maskable modes
2968 usually follow unmaskable ones, so that the maskable mode
2969 can be used to vectorize the epilogue of the unmaskable mode.
2971 This order is preferred because it leads to the maximum
2972 epilogue vectorization opportunities. Targets should only use
2973 a different order if they want to make wide modes available while
2974 disparaging them relative to earlier, smaller modes. The assumption
2975 in that case is that the wider modes are more expensive in some
2976 way that isn't reflected directly in the costs.
2978 There should therefore be few interesting cases in which
2979 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2980 treated as a standalone loop, and ends up being genuinely cheaper
2981 than FIRST_LOOP_VINFO. */
2982 if (vect_epilogues)
2983 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2985 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2986 if (mode_i == 0)
2987 autodetected_vector_mode = loop_vinfo->vector_mode;
2988 if (dump_enabled_p ())
2990 if (res)
2991 dump_printf_loc (MSG_NOTE, vect_location,
2992 "***** Analysis succeeded with vector mode %s\n",
2993 GET_MODE_NAME (loop_vinfo->vector_mode));
2994 else
2995 dump_printf_loc (MSG_NOTE, vect_location,
2996 "***** Analysis failed with vector mode %s\n",
2997 GET_MODE_NAME (loop_vinfo->vector_mode));
3000 loop->aux = NULL;
3002 if (!fatal)
3003 while (mode_i < vector_modes.length ()
3004 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
3006 if (dump_enabled_p ())
3007 dump_printf_loc (MSG_NOTE, vect_location,
3008 "***** The result for vector mode %s would"
3009 " be the same\n",
3010 GET_MODE_NAME (vector_modes[mode_i]));
3011 mode_i += 1;
3014 if (res)
3016 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3017 vectorized_loops++;
3019 /* Once we hit the desired simdlen for the first time,
3020 discard any previous attempts. */
3021 if (simdlen
3022 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3024 delete first_loop_vinfo;
3025 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3026 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
3027 simdlen = 0;
3029 else if (pick_lowest_cost_p && first_loop_vinfo)
3031 /* Keep trying to roll back vectorization attempts while the
3032 loop_vec_infos they produced were worse than this one. */
3033 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3034 while (!vinfos.is_empty ()
3035 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3037 gcc_assert (vect_epilogues);
3038 delete vinfos.pop ();
3040 if (vinfos.is_empty ()
3041 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3043 loop_vec_info main_loop_vinfo
3044 = vect_reanalyze_as_main_loop (loop_vinfo, &n_stmts);
3045 if (main_loop_vinfo == loop_vinfo)
3047 delete first_loop_vinfo;
3048 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3050 else if (main_loop_vinfo
3051 && vect_joust_loop_vinfos (main_loop_vinfo,
3052 first_loop_vinfo))
3054 delete first_loop_vinfo;
3055 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3056 delete loop_vinfo;
3057 loop_vinfo
3058 = opt_loop_vec_info::success (main_loop_vinfo);
3060 else
3061 delete main_loop_vinfo;
3065 if (first_loop_vinfo == NULL)
3067 first_loop_vinfo = loop_vinfo;
3068 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3070 else if (vect_epilogues
3071 /* For now only allow one epilogue loop. */
3072 && first_loop_vinfo->epilogue_vinfos.is_empty ())
3074 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3075 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3076 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3077 || maybe_ne (lowest_th, 0U));
3078 /* Keep track of the known smallest versioning
3079 threshold. */
3080 if (ordered_p (lowest_th, th))
3081 lowest_th = ordered_min (lowest_th, th);
3083 else
3085 delete loop_vinfo;
3086 loop_vinfo = opt_loop_vec_info::success (NULL);
3089 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3090 enabled, SIMDUID is not set, it is the innermost loop and we have
3091 either already found the loop's SIMDLEN or there was no SIMDLEN to
3092 begin with.
3093 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3094 vect_epilogues = (!simdlen
3095 && loop->inner == NULL
3096 && param_vect_epilogues_nomask
3097 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3098 && !loop->simduid
3099 /* For now only allow one epilogue loop, but allow
3100 pick_lowest_cost_p to replace it. */
3101 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
3102 || pick_lowest_cost_p));
3104 /* Commit to first_loop_vinfo if we have no reason to try
3105 alternatives. */
3106 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3107 break;
3109 else
3111 delete loop_vinfo;
3112 loop_vinfo = opt_loop_vec_info::success (NULL);
3113 if (fatal)
3115 gcc_checking_assert (first_loop_vinfo == NULL);
3116 break;
3120 /* Handle the case where the original loop can use partial
3121 vectorization, but we only want to adopt it for the epilogue.
3122 The retry should be in the same mode as the original. */
3123 if (vect_epilogues
3124 && loop_vinfo
3125 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3127 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3128 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3129 if (dump_enabled_p ())
3130 dump_printf_loc (MSG_NOTE, vect_location,
3131 "***** Re-trying analysis with same vector mode"
3132 " %s for epilogue with partial vectors.\n",
3133 GET_MODE_NAME (loop_vinfo->vector_mode));
3134 continue;
3137 if (mode_i < vector_modes.length ()
3138 && VECTOR_MODE_P (autodetected_vector_mode)
3139 && (related_vector_mode (vector_modes[mode_i],
3140 GET_MODE_INNER (autodetected_vector_mode))
3141 == autodetected_vector_mode)
3142 && (related_vector_mode (autodetected_vector_mode,
3143 GET_MODE_INNER (vector_modes[mode_i]))
3144 == vector_modes[mode_i]))
3146 if (dump_enabled_p ())
3147 dump_printf_loc (MSG_NOTE, vect_location,
3148 "***** Skipping vector mode %s, which would"
3149 " repeat the analysis for %s\n",
3150 GET_MODE_NAME (vector_modes[mode_i]),
3151 GET_MODE_NAME (autodetected_vector_mode));
3152 mode_i += 1;
3155 if (mode_i == vector_modes.length ()
3156 || autodetected_vector_mode == VOIDmode)
3157 break;
3159 /* Try the next biggest vector size. */
3160 next_vector_mode = vector_modes[mode_i++];
3161 if (dump_enabled_p ())
3162 dump_printf_loc (MSG_NOTE, vect_location,
3163 "***** Re-trying analysis with vector mode %s\n",
3164 GET_MODE_NAME (next_vector_mode));
3167 if (first_loop_vinfo)
3169 loop->aux = (loop_vec_info) first_loop_vinfo;
3170 if (dump_enabled_p ())
3171 dump_printf_loc (MSG_NOTE, vect_location,
3172 "***** Choosing vector mode %s\n",
3173 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3174 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3175 return first_loop_vinfo;
3178 return opt_loop_vec_info::propagate_failure (res);
3181 /* Return true if there is an in-order reduction function for CODE, storing
3182 it in *REDUC_FN if so. */
3184 static bool
3185 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3187 switch (code)
3189 case PLUS_EXPR:
3190 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3191 return true;
3193 default:
3194 return false;
3198 /* Function reduction_fn_for_scalar_code
3200 Input:
3201 CODE - tree_code of a reduction operation.
3203 Output:
3204 REDUC_FN - the corresponding internal function to be used to reduce the
3205 vector of partial results into a single scalar result, or IFN_LAST
3206 if the operation is a supported reduction operation, but does not have
3207 such an internal function.
3209 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3211 static bool
3212 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3214 switch (code)
3216 case MAX_EXPR:
3217 *reduc_fn = IFN_REDUC_MAX;
3218 return true;
3220 case MIN_EXPR:
3221 *reduc_fn = IFN_REDUC_MIN;
3222 return true;
3224 case PLUS_EXPR:
3225 *reduc_fn = IFN_REDUC_PLUS;
3226 return true;
3228 case BIT_AND_EXPR:
3229 *reduc_fn = IFN_REDUC_AND;
3230 return true;
3232 case BIT_IOR_EXPR:
3233 *reduc_fn = IFN_REDUC_IOR;
3234 return true;
3236 case BIT_XOR_EXPR:
3237 *reduc_fn = IFN_REDUC_XOR;
3238 return true;
3240 case MULT_EXPR:
3241 case MINUS_EXPR:
3242 *reduc_fn = IFN_LAST;
3243 return true;
3245 default:
3246 return false;
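/* For illustration (hypothetical scalar loop): a reduction such as

     int sum = 0;
     for (int i = 0; i < n; i++)
       sum += a[i];   // PLUS_EXPR

   maps to IFN_REDUC_PLUS above, which collapses the vector of partial sums
   into a single scalar after the vector loop.  MULT_EXPR and MINUS_EXPR
   return IFN_LAST: the reduction is supported, but the final step has no
   dedicated internal function.  */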
3250 /* If there is a neutral value X such that SLP reduction NODE would not
3251 be affected by the introduction of additional X elements, return that X,
3252 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3253 is the vector type that would hold element X. REDUC_CHAIN is true if
3254 the SLP statements perform a single reduction, false if each statement
3255 performs an independent reduction. */
3257 static tree
3258 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3259 tree_code code, bool reduc_chain)
3261 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3262 stmt_vec_info stmt_vinfo = stmts[0];
3263 tree scalar_type = TREE_TYPE (vector_type);
3264 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3265 gcc_assert (loop);
3267 switch (code)
3269 case WIDEN_SUM_EXPR:
3270 case DOT_PROD_EXPR:
3271 case SAD_EXPR:
3272 case PLUS_EXPR:
3273 case MINUS_EXPR:
3274 case BIT_IOR_EXPR:
3275 case BIT_XOR_EXPR:
3276 return build_zero_cst (scalar_type);
3278 case MULT_EXPR:
3279 return build_one_cst (scalar_type);
3281 case BIT_AND_EXPR:
3282 return build_all_ones_cst (scalar_type);
3284 case MAX_EXPR:
3285 case MIN_EXPR:
3286 /* For MIN/MAX the initial values are neutral. A reduction chain
3287 has only a single initial value, so that value is neutral for
3288 all statements. */
3289 if (reduc_chain)
3290 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3291 loop_preheader_edge (loop));
3292 return NULL_TREE;
3294 default:
3295 return NULL_TREE;
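/* Note on the neutral values above: X is chosen so that padding the vector
   with additional X elements leaves the reduction result unchanged, e.g.
   0 for PLUS_EXPR (s + 0 == s), 1 for MULT_EXPR (s * 1 == s) and all-ones
   for BIT_AND_EXPR (s & ~0 == s).  */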
3299 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3300 STMT is printed with a message MSG. */
3302 static void
3303 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3305 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3308 /* Return true if we need an in-order (fold-left) reduction for
3309 operation CODE on type TYPE, i.e. if the reduction must preserve
3310 the original order of evaluation. */
3312 bool
3313 needs_fold_left_reduction_p (tree type, tree_code code)
3315 /* CHECKME: check for !flag_finite_math_only too? */
3316 if (SCALAR_FLOAT_TYPE_P (type))
3317 switch (code)
3319 case MIN_EXPR:
3320 case MAX_EXPR:
3321 return false;
3323 default:
3324 return !flag_associative_math;
3327 if (INTEGRAL_TYPE_P (type))
3329 if (!operation_no_trapping_overflow (type, code))
3330 return true;
3331 return false;
3334 if (SAT_FIXED_POINT_TYPE_P (type))
3335 return true;
3337 return false;
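/* For illustration (hypothetical scalar loop): without -fassociative-math
   a float accumulation such as

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += a[i];

   must preserve the original evaluation order and therefore needs the
   in-order (fold-left) scheme, whereas MIN_EXPR/MAX_EXPR on floats and
   integer operations whose overflow cannot trap do not.  */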
3340 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3341 has a handled computation expression. Store the main reduction
3342 operation in *CODE. */
3344 static bool
3345 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3346 tree loop_arg, enum tree_code *code,
3347 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3349 auto_bitmap visited;
3350 tree lookfor = PHI_RESULT (phi);
3351 ssa_op_iter curri;
3352 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3353 while (USE_FROM_PTR (curr) != loop_arg)
3354 curr = op_iter_next_use (&curri);
3355 curri.i = curri.numops;
3358 path.safe_push (std::make_pair (curri, curr));
3359 tree use = USE_FROM_PTR (curr);
3360 if (use == lookfor)
3361 break;
3362 gimple *def = SSA_NAME_DEF_STMT (use);
3363 if (gimple_nop_p (def)
3364 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3366 pop:
3369 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3370 curri = x.first;
3371 curr = x.second;
3373 curr = op_iter_next_use (&curri);
3374 /* Skip already visited or non-SSA operands (from iterating
3375 over PHI args). */
3376 while (curr != NULL_USE_OPERAND_P
3377 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3378 || ! bitmap_set_bit (visited,
3379 SSA_NAME_VERSION
3380 (USE_FROM_PTR (curr)))));
3382 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3383 if (curr == NULL_USE_OPERAND_P)
3384 break;
3386 else
3388 if (gimple_code (def) == GIMPLE_PHI)
3389 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3390 else
3391 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3392 while (curr != NULL_USE_OPERAND_P
3393 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3394 || ! bitmap_set_bit (visited,
3395 SSA_NAME_VERSION
3396 (USE_FROM_PTR (curr)))))
3397 curr = op_iter_next_use (&curri);
3398 if (curr == NULL_USE_OPERAND_P)
3399 goto pop;
3402 while (1);
3403 if (dump_file && (dump_flags & TDF_DETAILS))
3405 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3406 unsigned i;
3407 std::pair<ssa_op_iter, use_operand_p> *x;
3408 FOR_EACH_VEC_ELT (path, i, x)
3409 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3410 dump_printf (MSG_NOTE, "\n");
3413 /* Check whether the reduction path detected is valid. */
3414 bool fail = path.length () == 0;
3415 bool neg = false;
3416 int sign = -1;
3417 *code = ERROR_MARK;
3418 for (unsigned i = 1; i < path.length (); ++i)
3420 gimple *use_stmt = USE_STMT (path[i].second);
3421 tree op = USE_FROM_PTR (path[i].second);
3422 if (! is_gimple_assign (use_stmt)
3423 /* The following makes sure we can compute the operand index
3424 easily; it also mostly disallows chaining via COND_EXPR condition
3425 operands. */
3426 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3427 && (gimple_num_ops (use_stmt) <= 2
3428 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3429 && (gimple_num_ops (use_stmt) <= 3
3430 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3432 fail = true;
3433 break;
3435 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3436 if (use_code == MINUS_EXPR)
3438 use_code = PLUS_EXPR;
3439 /* Track whether we negate the reduction value each iteration. */
3440 if (gimple_assign_rhs2 (use_stmt) == op)
3441 neg = ! neg;
3443 if (CONVERT_EXPR_CODE_P (use_code)
3444 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3445 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3447 else if (*code == ERROR_MARK)
3449 *code = use_code;
3450 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3452 else if (use_code != *code)
3454 fail = true;
3455 break;
3457 else if ((use_code == MIN_EXPR
3458 || use_code == MAX_EXPR)
3459 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3461 fail = true;
3462 break;
3464 /* Check there's only a single stmt the op is used on. For the
3465 non-value-changing tail and the last stmt, allow out-of-loop uses.
3466 ??? We could relax this and handle arbitrary live stmts by
3467 forcing a scalar epilogue for example. */
3468 imm_use_iterator imm_iter;
3469 gimple *op_use_stmt;
3470 unsigned cnt = 0;
3471 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3472 if (!is_gimple_debug (op_use_stmt)
3473 && (*code != ERROR_MARK
3474 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3476 /* We want to allow x + x but not x < 1 ? x : 2. */
3477 if (is_gimple_assign (op_use_stmt)
3478 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3480 use_operand_p use_p;
3481 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3482 cnt++;
3484 else
3485 cnt++;
3487 if (cnt != 1)
3489 fail = true;
3490 break;
3493 return ! fail && ! neg && *code != ERROR_MARK;
3496 bool
3497 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3498 tree loop_arg, enum tree_code code)
3500 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3501 enum tree_code code_;
3502 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3503 && code_ == code);
3508 /* Function vect_is_simple_reduction
3510 (1) Detect a cross-iteration def-use cycle that represents a simple
3511 reduction computation. We look for the following pattern:
3513 loop_header:
3514 a1 = phi < a0, a2 >
3515 a3 = ...
3516 a2 = operation (a3, a1)
3520 a3 = ...
3521 loop_header:
3522 a1 = phi < a0, a2 >
3523 a2 = operation (a3, a1)
3525 such that:
3526 1. operation is commutative and associative and it is safe to
3527 change the order of the computation
3528 2. no uses for a2 in the loop (a2 is used out of the loop)
3529 3. no uses of a1 in the loop besides the reduction operation
3530 4. no uses of a1 outside the loop.
3532 Conditions 1,4 are tested here.
3533 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3535 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3536 nested cycles.
3538 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3539 reductions:
3541 a1 = phi < a0, a2 >
3542 inner loop (def of a3)
3543 a2 = phi < a3 >
3545 (4) Detect condition expressions, i.e.:
3546 for (int i = 0; i < N; i++)
3547 if (a[i] < val)
3548 ret_val = a[i];
3552 static stmt_vec_info
3553 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3554 bool *double_reduc, bool *reduc_chain_p)
3556 gphi *phi = as_a <gphi *> (phi_info->stmt);
3557 gimple *phi_use_stmt = NULL;
3558 imm_use_iterator imm_iter;
3559 use_operand_p use_p;
3561 *double_reduc = false;
3562 *reduc_chain_p = false;
3563 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3565 tree phi_name = PHI_RESULT (phi);
3566 /* ??? If there are no uses of the PHI result the inner loop reduction
3567 won't be detected as possibly double-reduction by vectorizable_reduction
3568 because that tries to walk the PHI arg from the preheader edge which
3569 can be constant. See PR60382. */
3570 if (has_zero_uses (phi_name))
3571 return NULL;
3572 class loop *loop = (gimple_bb (phi))->loop_father;
3573 unsigned nphi_def_loop_uses = 0;
3574 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3576 gimple *use_stmt = USE_STMT (use_p);
3577 if (is_gimple_debug (use_stmt))
3578 continue;
3580 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3584 "intermediate value used outside loop.\n");
3586 return NULL;
3589 nphi_def_loop_uses++;
3590 phi_use_stmt = use_stmt;
3593 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3594 if (TREE_CODE (latch_def) != SSA_NAME)
3596 if (dump_enabled_p ())
3597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3598 "reduction: not ssa_name: %T\n", latch_def);
3599 return NULL;
3602 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3603 if (!def_stmt_info
3604 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3605 return NULL;
3607 bool nested_in_vect_loop
3608 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3609 unsigned nlatch_def_loop_uses = 0;
3610 auto_vec<gphi *, 3> lcphis;
3611 bool inner_loop_of_double_reduc = false;
3612 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3614 gimple *use_stmt = USE_STMT (use_p);
3615 if (is_gimple_debug (use_stmt))
3616 continue;
3617 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3618 nlatch_def_loop_uses++;
3619 else
3621 /* We can have more than one loop-closed PHI. */
3622 lcphis.safe_push (as_a <gphi *> (use_stmt));
3623 if (nested_in_vect_loop
3624 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3625 == vect_double_reduction_def))
3626 inner_loop_of_double_reduc = true;
3630 /* If we are vectorizing an inner reduction we execute it in the
3631 original order only when we are not dealing with a
3632 double reduction. */
3633 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3635 if (dump_enabled_p ())
3636 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3637 "detected nested cycle: ");
3638 return def_stmt_info;
3641 /* If this isn't a nested cycle, or if the nested cycle reduction value
3642 is used outside of the inner loop, we cannot handle uses of the reduction
3643 value. */
3644 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3646 if (dump_enabled_p ())
3647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3648 "reduction used in loop.\n");
3649 return NULL;
3652 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3653 defined in the inner loop. */
3654 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3656 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3657 if (gimple_phi_num_args (def_stmt) != 1
3658 || TREE_CODE (op1) != SSA_NAME)
3660 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3662 "unsupported phi node definition.\n");
3664 return NULL;
3667 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3668 if (gimple_bb (def1)
3669 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3670 && loop->inner
3671 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3672 && is_gimple_assign (def1)
3673 && is_a <gphi *> (phi_use_stmt)
3674 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3676 if (dump_enabled_p ())
3677 report_vect_op (MSG_NOTE, def_stmt,
3678 "detected double reduction: ");
3680 *double_reduc = true;
3681 return def_stmt_info;
3684 return NULL;
3687 /* Look for the expression computing latch_def from the loop PHI result. */
3688 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3689 enum tree_code code;
3690 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3691 path))
3693 STMT_VINFO_REDUC_CODE (phi_info) = code;
3694 if (code == COND_EXPR && !nested_in_vect_loop)
3695 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3697 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3698 reduction chain for which the additional restriction is that
3699 all operations in the chain are the same. */
3700 auto_vec<stmt_vec_info, 8> reduc_chain;
3701 unsigned i;
3702 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3703 for (i = path.length () - 1; i >= 1; --i)
3705 gimple *stmt = USE_STMT (path[i].second);
3706 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3707 STMT_VINFO_REDUC_IDX (stmt_info)
3708 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3709 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3710 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3711 && (i == 1 || i == path.length () - 1));
3712 if ((stmt_code != code && !leading_conversion)
3713 /* We can only handle the final value in epilogue
3714 generation for reduction chains. */
3715 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3716 is_slp_reduc = false;
3717 /* For reduction chains we support trailing/leading
3718 conversions. We do not store those in the actual chain. */
3719 if (leading_conversion)
3720 continue;
3721 reduc_chain.safe_push (stmt_info);
3723 if (is_slp_reduc && reduc_chain.length () > 1)
3725 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3727 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3728 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3730 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3731 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3733 /* Save the chain for further analysis in SLP detection. */
3734 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3735 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3737 *reduc_chain_p = true;
3738 if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE, vect_location,
3740 "reduction: detected reduction chain\n");
3742 else if (dump_enabled_p ())
3743 dump_printf_loc (MSG_NOTE, vect_location,
3744 "reduction: detected reduction\n");
3746 return def_stmt_info;
3749 if (dump_enabled_p ())
3750 dump_printf_loc (MSG_NOTE, vect_location,
3751 "reduction: unknown pattern\n");
3753 return NULL;
3756 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3757 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3758 or -1 if not known. */
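/* A small worked example with made-up numbers: for an assumed VF of 8,
   NITERS known to be 100 and PEEL_ITERS_PROLOGUE of 3, this returns
   (100 - 3) % 8 = 1.  If peeling for gaps is required and that remainder
   were 0, a full VF (8 iterations) is returned instead; with unknown
   NITERS it falls back to VF / 2 = 4.  */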
3760 static int
3761 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3763 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3764 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3766 if (dump_enabled_p ())
3767 dump_printf_loc (MSG_NOTE, vect_location,
3768 "cost model: epilogue peel iters set to vf/2 "
3769 "because loop iterations are unknown .\n");
3770 return assumed_vf / 2;
3772 else
3774 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3775 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3776 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3777 /* If we need to peel for gaps, but no epilogue peeling would otherwise
3778 be required, we have to peel VF iterations. */
3779 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3780 peel_iters_epilogue = assumed_vf;
3781 return peel_iters_epilogue;
3785 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3786 int
3787 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3788 int *peel_iters_epilogue,
3789 stmt_vector_for_cost *scalar_cost_vec,
3790 stmt_vector_for_cost *prologue_cost_vec,
3791 stmt_vector_for_cost *epilogue_cost_vec)
3793 int retval = 0;
3795 *peel_iters_epilogue
3796 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3798 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3800 /* If peeled iterations are known but the number of scalar loop
3801 iterations is unknown, count a taken branch per peeled loop. */
3802 if (peel_iters_prologue > 0)
3803 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3804 NULL, NULL_TREE, 0, vect_prologue);
3805 if (*peel_iters_epilogue > 0)
3806 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3807 NULL, NULL_TREE, 0, vect_epilogue);
3810 stmt_info_for_cost *si;
3811 int j;
3812 if (peel_iters_prologue)
3813 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3814 retval += record_stmt_cost (prologue_cost_vec,
3815 si->count * peel_iters_prologue,
3816 si->kind, si->stmt_info, si->misalign,
3817 vect_prologue);
3818 if (*peel_iters_epilogue)
3819 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3820 retval += record_stmt_cost (epilogue_cost_vec,
3821 si->count * *peel_iters_epilogue,
3822 si->kind, si->stmt_info, si->misalign,
3823 vect_epilogue);
3825 return retval;
3828 /* Function vect_estimate_min_profitable_iters
3830 Return the number of iterations required for the vector version of the
3831 loop to be profitable relative to the cost of the scalar version of the
3832 loop.
3834 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3835 of iterations for vectorization. -1 value means loop vectorization
3836 is not profitable. This returned value may be used for dynamic
3837 profitability check.
3839 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3840 for static check against estimated number of iterations. */
3842 static void
3843 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3844 int *ret_min_profitable_niters,
3845 int *ret_min_profitable_estimate)
3847 int min_profitable_iters;
3848 int min_profitable_estimate;
3849 int peel_iters_prologue;
3850 int peel_iters_epilogue;
3851 unsigned vec_inside_cost = 0;
3852 int vec_outside_cost = 0;
3853 unsigned vec_prologue_cost = 0;
3854 unsigned vec_epilogue_cost = 0;
3855 int scalar_single_iter_cost = 0;
3856 int scalar_outside_cost = 0;
3857 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3858 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3859 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3861 /* Cost model disabled. */
3862 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3864 if (dump_enabled_p ())
3865 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3866 *ret_min_profitable_niters = 0;
3867 *ret_min_profitable_estimate = 0;
3868 return;
3871 /* Requires loop versioning tests to handle misalignment. */
3872 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3874 /* FIXME: Make cost depend on complexity of individual check. */
3875 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3876 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3877 NULL, NULL_TREE, 0, vect_prologue);
3878 if (dump_enabled_p ())
3879 dump_printf (MSG_NOTE,
3880 "cost model: Adding cost of checks for loop "
3881 "versioning to treat misalignment.\n");
3884 /* Requires loop versioning with alias checks. */
3885 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3887 /* FIXME: Make cost depend on complexity of individual check. */
3888 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3889 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3890 NULL, NULL_TREE, 0, vect_prologue);
3891 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3892 if (len)
3893 /* Count LEN - 1 ANDs and LEN comparisons. */
3894 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3895 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3896 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3897 if (len)
3899 /* Count LEN - 1 ANDs and LEN comparisons. */
3900 unsigned int nstmts = len * 2 - 1;
3901 /* +1 for each bias that needs adding. */
3902 for (unsigned int i = 0; i < len; ++i)
3903 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3904 nstmts += 1;
3905 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3906 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3908 if (dump_enabled_p ())
3909 dump_printf (MSG_NOTE,
3910 "cost model: Adding cost of checks for loop "
3911 "versioning aliasing.\n");
3914 /* Requires loop versioning with niter checks. */
3915 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3917 /* FIXME: Make cost depend on complexity of individual check. */
3918 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3919 NULL, NULL_TREE, 0, vect_prologue);
3920 if (dump_enabled_p ())
3921 dump_printf (MSG_NOTE,
3922 "cost model: Adding cost of checks for loop "
3923 "versioning niters.\n");
3926 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3927 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3928 NULL, NULL_TREE, 0, vect_prologue);
3930 /* Count statements in scalar loop. Using this as scalar cost for a single
3931 iteration for now.
3933 TODO: Add outer loop support.
3935 TODO: Consider assigning different costs to different scalar
3936 statements. */
3938 scalar_single_iter_cost
3939 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3941 /* Add additional cost for the peeled instructions in prologue and epilogue
3942 loop. (For fully-masked loops there will be no peeling.)
3944 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3945 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3947 TODO: Build an expression that represents peel_iters for prologue and
3948 epilogue to be used in a run-time test. */
3950 bool prologue_need_br_taken_cost = false;
3951 bool prologue_need_br_not_taken_cost = false;
3953 /* Calculate peel_iters_prologue. */
3954 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3955 peel_iters_prologue = 0;
3956 else if (npeel < 0)
3958 peel_iters_prologue = assumed_vf / 2;
3959 if (dump_enabled_p ())
3960 dump_printf (MSG_NOTE, "cost model: "
3961 "prologue peel iters set to vf/2.\n");
3963 /* If peeled iterations are unknown, count a taken branch and a not taken
3964 branch per peeled loop. Even if scalar loop iterations are known,
3965 vector iterations are not known since peeled prologue iterations are
3966 not known. Hence guards remain the same. */
3967 prologue_need_br_taken_cost = true;
3968 prologue_need_br_not_taken_cost = true;
3970 else
3972 peel_iters_prologue = npeel;
3973 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3974 /* If peeled iterations are known but the number of scalar loop
3975 iterations is unknown, count a taken branch per peeled loop. */
3976 prologue_need_br_taken_cost = true;
3979 bool epilogue_need_br_taken_cost = false;
3980 bool epilogue_need_br_not_taken_cost = false;
3982 /* Calculate peel_iters_epilogue. */
3983 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3984 /* We need to peel exactly one iteration for gaps. */
3985 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3986 else if (npeel < 0)
3988 /* If peeling for alignment is unknown, loop bound of main loop
3989 becomes unknown. */
3990 peel_iters_epilogue = assumed_vf / 2;
3991 if (dump_enabled_p ())
3992 dump_printf (MSG_NOTE, "cost model: "
3993 "epilogue peel iters set to vf/2 because "
3994 "peeling for alignment is unknown.\n");
3996 /* See the same reason above in peel_iters_prologue calculation. */
3997 epilogue_need_br_taken_cost = true;
3998 epilogue_need_br_not_taken_cost = true;
4000 else
4002 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4003 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4004 /* If peeled iterations are known but the number of scalar loop
4005 iterations is unknown, count a taken branch per peeled loop. */
4006 epilogue_need_br_taken_cost = true;
4009 stmt_info_for_cost *si;
4010 int j;
4011 /* Add costs associated with peel_iters_prologue. */
4012 if (peel_iters_prologue)
4013 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4015 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4016 si->count * peel_iters_prologue, si->kind,
4017 si->stmt_info, si->vectype, si->misalign,
4018 vect_prologue);
4021 /* Add costs associated with peel_iters_epilogue. */
4022 if (peel_iters_epilogue)
4023 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4025 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4026 si->count * peel_iters_epilogue, si->kind,
4027 si->stmt_info, si->vectype, si->misalign,
4028 vect_epilogue);
4031 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4033 if (prologue_need_br_taken_cost)
4034 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4035 NULL, NULL_TREE, 0, vect_prologue);
4037 if (prologue_need_br_not_taken_cost)
4038 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4039 cond_branch_not_taken, NULL, NULL_TREE, 0,
4040 vect_prologue);
4042 if (epilogue_need_br_taken_cost)
4043 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4044 NULL, NULL_TREE, 0, vect_epilogue);
4046 if (epilogue_need_br_not_taken_cost)
4047 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4048 cond_branch_not_taken, NULL, NULL_TREE, 0,
4049 vect_epilogue);
4051 /* Take care of special costs for rgroup controls of partial vectors. */
4052 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4054 /* Calculate how many masks we need to generate. */
4055 unsigned int num_masks = 0;
4056 rgroup_controls *rgm;
4057 unsigned int num_vectors_m1;
4058 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4059 if (rgm->type)
4060 num_masks += num_vectors_m1 + 1;
4061 gcc_assert (num_masks > 0);
4063 /* In the worst case, we need to generate each mask in the prologue
4064 and in the loop body. One of the loop body mask instructions
4065 replaces the comparison in the scalar loop, and since we don't
4066 count the scalar comparison against the scalar body, we shouldn't
4067 count that vector instruction against the vector body either.
4069 Sometimes we can use unpacks instead of generating prologue
4070 masks and sometimes the prologue mask will fold to a constant,
4071 so the actual prologue cost might be smaller. However, it's
4072 simpler and safer to use the worst-case cost; if this ends up
4073 being the tie-breaker between vectorizing or not, then it's
4074 probably better not to vectorize. */
4075 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
4076 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4077 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
4078 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4080 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4082 /* Referring to the functions vect_set_loop_condition_partial_vectors
4083 and vect_set_loop_controls_directly, we need to generate each
4084 length in the prologue and in the loop body if required. Although
4085 there are some possible optimizations, we consider the worst case
4086 here. */
4088 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4089 bool need_iterate_p
4090 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4091 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4093 /* Calculate how many statements to be added. */
4094 unsigned int prologue_stmts = 0;
4095 unsigned int body_stmts = 0;
4097 rgroup_controls *rgc;
4098 unsigned int num_vectors_m1;
4099 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4100 if (rgc->type)
4102 /* May need one SHIFT for nitems_total computation. */
4103 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4104 if (nitems != 1 && !niters_known_p)
4105 prologue_stmts += 1;
4107 /* May need one MAX and one MINUS for wrap around. */
4108 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4109 prologue_stmts += 2;
4111 /* Need one MAX and one MINUS for each batch limit except for
4112 the 1st one. */
4113 prologue_stmts += num_vectors_m1 * 2;
4115 unsigned int num_vectors = num_vectors_m1 + 1;
4117 /* Need to set up lengths in prologue, only one MIN required
4118 for each since start index is zero. */
4119 prologue_stmts += num_vectors;
4121 /* Each may need two MINs and one MINUS to update lengths in body
4122 for next iteration. */
4123 if (need_iterate_p)
4124 body_stmts += 3 * num_vectors;
4127 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4128 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4129 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4130 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4133 /* FORNOW: The scalar outside cost is incremented in one of the
4134 following ways:
4136 1. The vectorizer checks for alignment and aliasing and generates
4137 a condition that allows dynamic vectorization. A cost model
4138 check is ANDED with the versioning condition. Hence scalar code
4139 path now has the added cost of the versioning check.
4141 if (cost > th & versioning_check)
4142 jmp to vector code
4144 Hence the run-time scalar cost is incremented by the not-taken branch cost.
4146 2. The vectorizer then checks if a prologue is required. If the
4147 cost model check was not done before during versioning, it has to
4148 be done before the prologue check.
4150 if (cost <= th)
4151 prologue = scalar_iters
4152 if (prologue == 0)
4153 jmp to vector code
4154 else
4155 execute prologue
4156 if (prologue == num_iters)
4157 go to exit
4159 Hence the run-time scalar cost is incremented by a taken branch,
4160 plus a not-taken branch, plus a taken branch cost.
4162 3. The vectorizer then checks if an epilogue is required. If the
4163 cost model check was not done before during prologue check, it
4164 has to be done with the epilogue check.
4166 if (prologue == 0)
4167 jmp to vector code
4168 else
4169 execute prologue
4170 if (prologue == num_iters)
4171 go to exit
4172 vector code:
4173 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4174 jmp to epilogue
4176 Hence the run-time scalar cost should be incremented by 2 taken
4177 branches.
4179 TODO: The back end may reorder the BBs differently and reverse
4180 conditions/branch directions. Change the estimates below to
4181 something more reasonable. */
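/* As a concrete reading of the cases above: a versioned loop (case 1)
   charges the scalar path one not-taken branch for the combined check;
   peeling for alignment by an unknown amount (case 2) charges two taken
   branches plus one not-taken branch; otherwise (case 3) two taken
   branches are charged.  That is what the code below does.  */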
4183 /* If the number of iterations is known and we do not do versioning, we can
4184 decide whether to vectorize at compile time. Hence the scalar version
4185 does not carry cost model guard costs. */
4186 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4187 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4189 /* Cost model check occurs at versioning. */
4190 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4191 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4192 else
4194 /* Cost model check occurs at prologue generation. */
4195 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4196 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4197 + vect_get_stmt_cost (cond_branch_not_taken);
4198 /* Cost model check occurs at epilogue generation. */
4199 else
4200 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4204 /* Complete the target-specific cost calculations. */
4205 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4206 &vec_inside_cost, &vec_epilogue_cost);
4208 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4210 /* Stash the costs so that we can compare two loop_vec_infos. */
4211 loop_vinfo->vec_inside_cost = vec_inside_cost;
4212 loop_vinfo->vec_outside_cost = vec_outside_cost;
4214 if (dump_enabled_p ())
4216 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4217 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4218 vec_inside_cost);
4219 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4220 vec_prologue_cost);
4221 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4222 vec_epilogue_cost);
4223 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4224 scalar_single_iter_cost);
4225 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4226 scalar_outside_cost);
4227 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4228 vec_outside_cost);
4229 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4230 peel_iters_prologue);
4231 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4232 peel_iters_epilogue);
4235 /* Calculate number of iterations required to make the vector version
4236 profitable, relative to the loop bodies only. The following condition
4237 must hold true:
4238 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4239 where
4240 SIC = scalar iteration cost, VIC = vector iteration cost,
4241 VOC = vector outside cost, VF = vectorization factor,
4242 NPEEL = prologue iterations + epilogue iterations,
4243 SOC = scalar outside cost for run time cost model check. */
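/* A worked example with made-up costs: for SIC = 4, VIC = 6, VF = 4,
   NPEEL = 0, SOC = 0 and VOC = 20 the condition becomes
   4 * niters > 6 * (niters / 4) + 20, which first holds at niters = 9,
   so roughly nine scalar iterations are needed before the vector loop
   pays off.  */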
4245 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4246 - vec_inside_cost);
4247 if (saving_per_viter <= 0)
4249 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4250 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4251 "vectorization did not happen for a simd loop");
4253 if (dump_enabled_p ())
4254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4255 "cost model: the vector iteration cost = %d "
4256 "divided by the scalar iteration cost = %d "
4257 "is greater or equal to the vectorization factor = %d"
4258 ".\n",
4259 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4260 *ret_min_profitable_niters = -1;
4261 *ret_min_profitable_estimate = -1;
4262 return;
4265 /* ??? The "if" arm is written to handle all cases; see below for what
4266 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4267 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4269 /* Rewriting the condition above in terms of the number of
4270 vector iterations (vniters) rather than the number of
4271 scalar iterations (niters) gives:
4273 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4275 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4277 For integer N, X and Y when X > 0:
4279 N * X > Y <==> N >= (Y /[floor] X) + 1. */
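/* For example, with Y = 10 and X = 3 this gives N >= 10 / 3 + 1 = 4;
   indeed 4 * 3 = 12 > 10 while 3 * 3 = 9 is not.  */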
4280 int outside_overhead = (vec_outside_cost
4281 - scalar_single_iter_cost * peel_iters_prologue
4282 - scalar_single_iter_cost * peel_iters_epilogue
4283 - scalar_outside_cost);
4284 /* We're only interested in cases that require at least one
4285 vector iteration. */
4286 int min_vec_niters = 1;
4287 if (outside_overhead > 0)
4288 min_vec_niters = outside_overhead / saving_per_viter + 1;
4290 if (dump_enabled_p ())
4291 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4292 min_vec_niters);
4294 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4296 /* Now that we know the minimum number of vector iterations,
4297 find the minimum niters for which the scalar cost is larger:
4299 SIC * niters > VIC * vniters + VOC - SOC
4301 We know that the minimum niters is no more than
4302 vniters * VF + NPEEL, but it might be (and often is) less
4303 than that if a partial vector iteration is cheaper than the
4304 equivalent scalar code. */
4305 int threshold = (vec_inside_cost * min_vec_niters
4306 + vec_outside_cost
4307 - scalar_outside_cost);
4308 if (threshold <= 0)
4309 min_profitable_iters = 1;
4310 else
4311 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4313 else
4314 /* Convert the number of vector iterations into a number of
4315 scalar iterations. */
4316 min_profitable_iters = (min_vec_niters * assumed_vf
4317 + peel_iters_prologue
4318 + peel_iters_epilogue);
4320 else
4322 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4323 * assumed_vf
4324 - vec_inside_cost * peel_iters_prologue
4325 - vec_inside_cost * peel_iters_epilogue);
4326 if (min_profitable_iters <= 0)
4327 min_profitable_iters = 0;
4328 else
4330 min_profitable_iters /= saving_per_viter;
4332 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4333 <= (((int) vec_inside_cost * min_profitable_iters)
4334 + (((int) vec_outside_cost - scalar_outside_cost)
4335 * assumed_vf)))
4336 min_profitable_iters++;
4340 if (dump_enabled_p ())
4341 dump_printf (MSG_NOTE,
4342 " Calculated minimum iters for profitability: %d\n",
4343 min_profitable_iters);
4345 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4346 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4347 /* We want the vectorized loop to execute at least once. */
4348 min_profitable_iters = assumed_vf + peel_iters_prologue;
4349 else if (min_profitable_iters < peel_iters_prologue)
4350 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4351 vectorized loop executes at least once. */
4352 min_profitable_iters = peel_iters_prologue;
4354 if (dump_enabled_p ())
4355 dump_printf_loc (MSG_NOTE, vect_location,
4356 " Runtime profitability threshold = %d\n",
4357 min_profitable_iters);
4359 *ret_min_profitable_niters = min_profitable_iters;
4361 /* Calculate number of iterations required to make the vector version
4362 profitable, relative to the loop bodies only.
4364 The non-vectorized variant costs SIC * niters and it must win over the
4365 vector variant for the expected loop trip count. The following condition must hold true:
4366 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4368 if (vec_outside_cost <= 0)
4369 min_profitable_estimate = 0;
4370 /* ??? This "else if" arm is written to handle all cases; see below for
4371 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4372 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4374 /* This is a repeat of the code above, but with + SOC rather
4375 than - SOC. */
4376 int outside_overhead = (vec_outside_cost
4377 - scalar_single_iter_cost * peel_iters_prologue
4378 - scalar_single_iter_cost * peel_iters_epilogue
4379 + scalar_outside_cost);
4380 int min_vec_niters = 1;
4381 if (outside_overhead > 0)
4382 min_vec_niters = outside_overhead / saving_per_viter + 1;
4384 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4386 int threshold = (vec_inside_cost * min_vec_niters
4387 + vec_outside_cost
4388 + scalar_outside_cost);
4389 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4391 else
4392 min_profitable_estimate = (min_vec_niters * assumed_vf
4393 + peel_iters_prologue
4394 + peel_iters_epilogue);
4396 else
4398 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4399 * assumed_vf
4400 - vec_inside_cost * peel_iters_prologue
4401 - vec_inside_cost * peel_iters_epilogue)
4402 / ((scalar_single_iter_cost * assumed_vf)
4403 - vec_inside_cost);
4405 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4406 if (dump_enabled_p ())
4407 dump_printf_loc (MSG_NOTE, vect_location,
4408 " Static estimate profitability threshold = %d\n",
4409 min_profitable_estimate);
4411 *ret_min_profitable_estimate = min_profitable_estimate;
4414 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4415 vector elements (not bits) for a vector with NELT elements. */
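/* For instance (sketch only), with OFFSET = 2 and NELT = 8 the three
   encoded elements are { 2, 3, 4 }, which the single stepped pattern
   extends to the selector { 2, 3, 4, 5, 6, 7, 8, 9 }; indices 8 and 9
   fall outside the first input and therefore select elements of the
   second vector of the permutation.  */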
4416 static void
4417 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4418 vec_perm_builder *sel)
4420 /* The encoding is a single stepped pattern. Any wrap-around is handled
4421 by vec_perm_indices. */
4422 sel->new_vector (nelt, 1, 3);
4423 for (unsigned int i = 0; i < 3; i++)
4424 sel->quick_push (i + offset);
4427 /* Checks whether the target supports whole-vector shifts for vectors of mode
4428 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4429 it supports vec_perm_const with masks for all necessary shift amounts. */
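/* For example, with a constant NELT of 8 the loop below checks element
   shifts of 4, 2 and 1, roughly the offsets a log2-style whole-vector
   shift reduction epilogue would need.  */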
4430 static bool
4431 have_whole_vector_shift (machine_mode mode)
4433 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4434 return true;
4436 /* Variable-length vectors should be handled via the optab. */
4437 unsigned int nelt;
4438 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4439 return false;
4441 vec_perm_builder sel;
4442 vec_perm_indices indices;
4443 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4445 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4446 indices.new_vector (sel, 2, nelt);
4447 if (!can_vec_perm_const_p (mode, indices, false))
4448 return false;
4450 return true;
4453 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4454 functions. Design better to avoid maintenance issues. */
4456 /* Function vect_model_reduction_cost.
4458 Models cost for a reduction operation, including the vector ops
4459 generated within the strip-mine loop in some cases, the initial
4460 definition before the loop, and the epilogue code that must be generated. */
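/* A rough sketch of the non-nested case, assuming the target provides a
   suitable reduction internal function: a plain integer sum reduction is
   charged one scalar_to_vec in the prologue for the initial definition
   and, in the epilogue, one vector_stmt for the reduction proper plus one
   vec_to_scalar for extracting the scalar result.  */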
4462 static void
4463 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4464 stmt_vec_info stmt_info, internal_fn reduc_fn,
4465 vect_reduction_type reduction_type,
4466 int ncopies, stmt_vector_for_cost *cost_vec)
4468 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4469 enum tree_code code;
4470 optab optab;
4471 tree vectype;
4472 machine_mode mode;
4473 class loop *loop = NULL;
4475 if (loop_vinfo)
4476 loop = LOOP_VINFO_LOOP (loop_vinfo);
4478 /* Condition reductions generate two reductions in the loop. */
4479 if (reduction_type == COND_REDUCTION)
4480 ncopies *= 2;
4482 vectype = STMT_VINFO_VECTYPE (stmt_info);
4483 mode = TYPE_MODE (vectype);
4484 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4486 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4488 if (reduction_type == EXTRACT_LAST_REDUCTION)
4489 /* No extra instructions are needed in the prologue. The loop body
4490 operations are costed in vectorizable_condition. */
4491 inside_cost = 0;
4492 else if (reduction_type == FOLD_LEFT_REDUCTION)
4494 /* No extra instructions needed in the prologue. */
4495 prologue_cost = 0;
4497 if (reduc_fn != IFN_LAST)
4498 /* Count one reduction-like operation per vector. */
4499 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4500 stmt_info, 0, vect_body);
4501 else
4503 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4504 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4505 inside_cost = record_stmt_cost (cost_vec, nelements,
4506 vec_to_scalar, stmt_info, 0,
4507 vect_body);
4508 inside_cost += record_stmt_cost (cost_vec, nelements,
4509 scalar_stmt, stmt_info, 0,
4510 vect_body);
4513 else
4515 /* Add in cost for initial definition.
4516 For cond reduction we have four vectors: initial index, step,
4517 initial result of the data reduction, initial value of the index
4518 reduction. */
4519 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4520 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4521 scalar_to_vec, stmt_info, 0,
4522 vect_prologue);
4525 /* Determine cost of epilogue code.
4527 We have a reduction operator that will reduce the vector in one statement.
4528 Also requires scalar extract. */
4530 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4532 if (reduc_fn != IFN_LAST)
4534 if (reduction_type == COND_REDUCTION)
4536 /* An EQ stmt and a COND_EXPR stmt. */
4537 epilogue_cost += record_stmt_cost (cost_vec, 2,
4538 vector_stmt, stmt_info, 0,
4539 vect_epilogue);
4540 /* Reduction of the max index and a reduction of the found
4541 values. */
4542 epilogue_cost += record_stmt_cost (cost_vec, 2,
4543 vec_to_scalar, stmt_info, 0,
4544 vect_epilogue);
4545 /* A broadcast of the max value. */
4546 epilogue_cost += record_stmt_cost (cost_vec, 1,
4547 scalar_to_vec, stmt_info, 0,
4548 vect_epilogue);
4550 else
4552 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4553 stmt_info, 0, vect_epilogue);
4554 epilogue_cost += record_stmt_cost (cost_vec, 1,
4555 vec_to_scalar, stmt_info, 0,
4556 vect_epilogue);
4559 else if (reduction_type == COND_REDUCTION)
4561 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4562 /* Extraction of scalar elements. */
4563 epilogue_cost += record_stmt_cost (cost_vec,
4564 2 * estimated_nunits,
4565 vec_to_scalar, stmt_info, 0,
4566 vect_epilogue);
4567 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4568 epilogue_cost += record_stmt_cost (cost_vec,
4569 2 * estimated_nunits - 3,
4570 scalar_stmt, stmt_info, 0,
4571 vect_epilogue);
4573 else if (reduction_type == EXTRACT_LAST_REDUCTION
4574 || reduction_type == FOLD_LEFT_REDUCTION)
4575 /* No extra instructions are needed in the epilogue. */
4577 else
4579 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4580 tree bitsize =
4581 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4582 int element_bitsize = tree_to_uhwi (bitsize);
4583 int nelements = vec_size_in_bits / element_bitsize;
4585 if (code == COND_EXPR)
4586 code = MAX_EXPR;
4588 optab = optab_for_tree_code (code, vectype, optab_default);
4590 /* We have a whole vector shift available. */
4591 if (optab != unknown_optab
4592 && VECTOR_MODE_P (mode)
4593 && optab_handler (optab, mode) != CODE_FOR_nothing
4594 && have_whole_vector_shift (mode))
4596 /* Final reduction via vector shifts and the reduction operator.
4597 Also requires scalar extract. */
4598 epilogue_cost += record_stmt_cost (cost_vec,
4599 exact_log2 (nelements) * 2,
4600 vector_stmt, stmt_info, 0,
4601 vect_epilogue);
4602 epilogue_cost += record_stmt_cost (cost_vec, 1,
4603 vec_to_scalar, stmt_info, 0,
4604 vect_epilogue);
4606 else
4607 /* Use extracts and reduction op for final reduction. For N
4608 elements, we have N extracts and N-1 reduction ops. */
4609 epilogue_cost += record_stmt_cost (cost_vec,
4610 nelements + nelements - 1,
4611 vector_stmt, stmt_info, 0,
4612 vect_epilogue);
4616 if (dump_enabled_p ())
4617 dump_printf (MSG_NOTE,
4618 "vect_model_reduction_cost: inside_cost = %d, "
4619 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4620 prologue_cost, epilogue_cost);
4625 /* Function get_initial_def_for_reduction
4627 Input:
4628 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4629 INIT_VAL - the initial value of the reduction variable
4631 Output:
4632 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4633 of the reduction (used for adjusting the epilog - see below).
4634 Return a vector variable, initialized according to the operation that
4635 STMT_VINFO performs. This vector will be used as the initial value
4636 of the vector of partial results.
4638 Option1 (adjust in epilog): Initialize the vector as follows:
4639 add/bit or/xor: [0,0,...,0,0]
4640 mult/bit and: [1,1,...,1,1]
4641 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4642 and when necessary (e.g. add/mult case) let the caller know
4643 that it needs to adjust the result by init_val.
4645 Option2: Initialize the vector as follows:
4646 add/bit or/xor: [init_val,0,0,...,0]
4647 mult/bit and: [init_val,1,1,...,1]
4648 min/max/cond_expr: [init_val,init_val,...,init_val]
4649 and no adjustments are needed.
4651 For example, for the following code:
4653 s = init_val;
4654 for (i=0;i<n;i++)
4655 s = s + a[i];
4657 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4658 For a vector of 4 units, we want to return either [0,0,0,init_val],
4659 or [0,0,0,0] and let the caller know that it needs to adjust
4660 the result at the end by 'init_val'.
4662 FORNOW, we use the 'adjust in epilog' scheme (Option1) if ADJUSTMENT_DEF
4663 is not NULL, because this way the initialization vector is simpler (the
4664 same element in all entries), and Option2 otherwise.
4666 A cost model should help decide between these two schemes. */
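/* Continuing the example above: with init_val = 10 and Option1 the
   returned vector is {0,0,0,0} and *ADJUSTMENT_DEF is set to 10, so the
   caller adds 10 back once the partial sums have been reduced.  */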
4668 static tree
4669 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4670 stmt_vec_info stmt_vinfo,
4671 enum tree_code code, tree init_val,
4672 tree *adjustment_def)
4674 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4675 tree scalar_type = TREE_TYPE (init_val);
4676 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4677 tree def_for_init;
4678 tree init_def;
4679 REAL_VALUE_TYPE real_init_val = dconst0;
4680 int int_init_val = 0;
4681 gimple_seq stmts = NULL;
4683 gcc_assert (vectype);
4685 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4686 || SCALAR_FLOAT_TYPE_P (scalar_type));
4688 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4689 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4691 /* ADJUSTMENT_DEF is NULL when called from
4692 vect_create_epilog_for_reduction to vectorize double reduction. */
4693 if (adjustment_def)
4694 *adjustment_def = NULL;
4696 switch (code)
4698 case WIDEN_SUM_EXPR:
4699 case DOT_PROD_EXPR:
4700 case SAD_EXPR:
4701 case PLUS_EXPR:
4702 case MINUS_EXPR:
4703 case BIT_IOR_EXPR:
4704 case BIT_XOR_EXPR:
4705 case MULT_EXPR:
4706 case BIT_AND_EXPR:
4708 if (code == MULT_EXPR)
4710 real_init_val = dconst1;
4711 int_init_val = 1;
4714 if (code == BIT_AND_EXPR)
4715 int_init_val = -1;
4717 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4718 def_for_init = build_real (scalar_type, real_init_val);
4719 else
4720 def_for_init = build_int_cst (scalar_type, int_init_val);
4722 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4724 /* Option1: the first element is '0' or '1' as well. */
4725 if (!operand_equal_p (def_for_init, init_val, 0))
4726 *adjustment_def = init_val;
4727 init_def = gimple_build_vector_from_val (&stmts, vectype,
4728 def_for_init);
4730 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4732 /* Option2 (variable length): the first element is INIT_VAL. */
4733 init_def = gimple_build_vector_from_val (&stmts, vectype,
4734 def_for_init);
4735 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4736 vectype, init_def, init_val);
4738 else
4740 /* Option2: the first element is INIT_VAL. */
4741 tree_vector_builder elts (vectype, 1, 2);
4742 elts.quick_push (init_val);
4743 elts.quick_push (def_for_init);
4744 init_def = gimple_build_vector (&stmts, &elts);
4747 break;
4749 case MIN_EXPR:
4750 case MAX_EXPR:
4751 case COND_EXPR:
4753 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4754 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4756 break;
4758 default:
4759 gcc_unreachable ();
4762 if (stmts)
4763 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4764 return init_def;
4767 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4768 NUMBER_OF_VECTORS is the number of vector defs to create.
4769 If NEUTRAL_OP is nonnull, introducing extra elements of that
4770 value will not change the result. */
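/* The neutral value is the identity of the reduction operation, e.g. 0
   for a sum or bitwise-or reduction and 1 for a product, so padding a
   vector with it leaves the reduced result unchanged.  */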
4772 static void
4773 get_initial_defs_for_reduction (vec_info *vinfo,
4774 slp_tree slp_node,
4775 vec<tree> *vec_oprnds,
4776 unsigned int number_of_vectors,
4777 bool reduc_chain, tree neutral_op)
4779 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4780 stmt_vec_info stmt_vinfo = stmts[0];
4781 unsigned HOST_WIDE_INT nunits;
4782 unsigned j, number_of_places_left_in_vector;
4783 tree vector_type;
4784 unsigned int group_size = stmts.length ();
4785 unsigned int i;
4786 class loop *loop;
4788 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4790 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4792 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4793 gcc_assert (loop);
4794 edge pe = loop_preheader_edge (loop);
4796 gcc_assert (!reduc_chain || neutral_op);
4798 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4799 created vectors. It is greater than 1 if unrolling is performed.
4801 For example, we have two scalar operands, s1 and s2 (e.g., group of
4802 strided accesses of size two), while NUNITS is four (i.e., four scalars
4803 of this type can be packed in a vector). The output vector will contain
4804 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4805 will be 2).
4807 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4808 vectors containing the operands.
4810 For example, NUNITS is four as before, and the group size is 8
4811 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4812 {s5, s6, s7, s8}. */
4814 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4815 nunits = group_size;
4817 number_of_places_left_in_vector = nunits;
4818 bool constant_p = true;
4819 tree_vector_builder elts (vector_type, nunits, 1);
4820 elts.quick_grow (nunits);
4821 gimple_seq ctor_seq = NULL;
4822 for (j = 0; j < nunits * number_of_vectors; ++j)
4824 tree op;
4825 i = j % group_size;
4826 stmt_vinfo = stmts[i];
4828 /* Get the def before the loop. In a reduction chain we have only
4829 one initial value. Else we have as many initial values as PHIs in the group. */
4830 if (reduc_chain)
4831 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4832 else if (((vec_oprnds->length () + 1) * nunits
4833 - number_of_places_left_in_vector >= group_size)
4834 && neutral_op)
4835 op = neutral_op;
4836 else
4837 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4839 /* Create 'vect_ = {op0,op1,...,opn}'. */
4840 number_of_places_left_in_vector--;
4841 elts[nunits - number_of_places_left_in_vector - 1] = op;
4842 if (!CONSTANT_CLASS_P (op))
4843 constant_p = false;
4845 if (number_of_places_left_in_vector == 0)
4847 tree init;
4848 if (constant_p && !neutral_op
4849 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4850 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4851 /* Build the vector directly from ELTS. */
4852 init = gimple_build_vector (&ctor_seq, &elts);
4853 else if (neutral_op)
4855 /* Build a vector of the neutral value and shift the
4856 other elements into place. */
4857 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4858 neutral_op);
4859 int k = nunits;
4860 while (k > 0 && elts[k - 1] == neutral_op)
4861 k -= 1;
4862 while (k > 0)
4864 k -= 1;
4865 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4866 vector_type, init, elts[k]);
4869 else
4871 /* First time round, duplicate ELTS to fill the
4872 required number of vectors. */
4873 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4874 number_of_vectors, *vec_oprnds);
4875 break;
4877 vec_oprnds->quick_push (init);
4879 number_of_places_left_in_vector = nunits;
4880 elts.new_vector (vector_type, nunits, 1);
4881 elts.quick_grow (nunits);
4882 constant_p = true;
4885 if (ctor_seq != NULL)
4886 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4889 /* For a statement STMT_INFO taking part in a reduction operation return
4890 the stmt_vec_info the meta information is stored on. */
4892 stmt_vec_info
4893 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4895 stmt_info = vect_orig_stmt (stmt_info);
4896 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4897 if (!is_a <gphi *> (stmt_info->stmt)
4898 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4899 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4900 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4901 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4903 if (gimple_phi_num_args (phi) == 1)
4904 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4906 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4908 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4909 stmt_vec_info info
4910 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4911 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4912 stmt_info = info;
4914 return stmt_info;
4917 /* Function vect_create_epilog_for_reduction
4919 Create code at the loop-epilog to finalize the result of a reduction
4920 computation.
4922 STMT_INFO is the scalar reduction stmt that is being vectorized.
4923 SLP_NODE is an SLP node containing a group of reduction statements. The
4924 first one in this group is STMT_INFO.
4925 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
4926 REDUC_INDEX says which rhs operand of STMT_INFO is the reduction phi
4927 (counting from 0).
4929 This function:
4930 1. Completes the reduction def-use cycles.
4931 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4932 by calling the function specified by REDUC_FN if available, or by
4933 other means (whole-vector shifts or a scalar loop).
4934 The function also creates a new phi node at the loop exit to preserve
4935 loop-closed form, as illustrated below.
4937 The flow at the entry to this function:
4939 loop:
4940 vec_def = phi <vec_init, null> # REDUCTION_PHI
4941 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4942 s_loop = scalar_stmt # (scalar) STMT_INFO
4943 loop_exit:
4944 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4945 use <s_out0>
4946 use <s_out0>
4948 The above is transformed by this function into:
4950 loop:
4951 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4952 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4953 s_loop = scalar_stmt # (scalar) STMT_INFO
4954 loop_exit:
4955 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4956 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4957 v_out2 = reduce <v_out1>
4958 s_out3 = extract_field <v_out2, 0>
4959 s_out4 = adjust_result <s_out3>
4960 use <s_out4>
4961 use <s_out4>
4964 static void
4965 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4966 stmt_vec_info stmt_info,
4967 slp_tree slp_node,
4968 slp_instance slp_node_instance)
4970 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4971 gcc_assert (reduc_info->is_reduc_info);
4972 /* For double reductions we need to get at the inner loop reduction
4973 stmt which has the meta info attached. Our stmt_info is that of the
4974 loop-closed PHI of the inner loop which we remember as
4975 def for the reduction PHI generation. */
4976 bool double_reduc = false;
4977 stmt_vec_info rdef_info = stmt_info;
4978 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4980 gcc_assert (!slp_node);
4981 double_reduc = true;
4982 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4983 (stmt_info->stmt, 0));
4984 stmt_info = vect_stmt_to_vectorize (stmt_info);
4986 gphi *reduc_def_stmt
4987 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4988 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4989 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4990 tree vectype;
4991 machine_mode mode;
4992 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4993 basic_block exit_bb;
4994 tree scalar_dest;
4995 tree scalar_type;
4996 gimple *new_phi = NULL, *phi;
4997 gimple_stmt_iterator exit_gsi;
4998 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4999 gimple *epilog_stmt = NULL;
5000 gimple *exit_phi;
5001 tree bitsize;
5002 tree def;
5003 tree orig_name, scalar_result;
5004 imm_use_iterator imm_iter, phi_imm_iter;
5005 use_operand_p use_p, phi_use_p;
5006 gimple *use_stmt;
5007 bool nested_in_vect_loop = false;
5008 auto_vec<gimple *> new_phis;
5009 int j, i;
5010 auto_vec<tree> scalar_results;
5011 unsigned int group_size = 1, k;
5012 auto_vec<gimple *> phis;
5013 bool slp_reduc = false;
5014 bool direct_slp_reduc;
5015 tree new_phi_result;
5016 tree induction_index = NULL_TREE;
5018 if (slp_node)
5019 group_size = SLP_TREE_LANES (slp_node);
5021 if (nested_in_vect_loop_p (loop, stmt_info))
5023 outer_loop = loop;
5024 loop = loop->inner;
5025 nested_in_vect_loop = true;
5026 gcc_assert (!slp_node);
5028 gcc_assert (!nested_in_vect_loop || double_reduc);
5030 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5031 gcc_assert (vectype);
5032 mode = TYPE_MODE (vectype);
5034 tree initial_def = NULL;
5035 tree induc_val = NULL_TREE;
5036 tree adjustment_def = NULL;
5037 if (slp_node)
5039 else
5041 /* Get at the scalar def before the loop, that defines the initial value
5042 of the reduction variable. */
5043 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5044 loop_preheader_edge (loop));
5045 /* Optimize: for induction condition reduction, if we can't use zero
5046 for induc_val, use initial_def. */
5047 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5048 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5049 else if (double_reduc)
5051 else if (nested_in_vect_loop)
5053 else
5054 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5057 unsigned vec_num;
5058 int ncopies;
5059 if (slp_node)
5061 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5062 ncopies = 1;
5064 else
5066 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5067 vec_num = 1;
5068 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5071 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5072 which is updated with the current index of the loop for every match of
5073 the original loop's cond_expr (VEC_STMT). This results in a vector
5074 containing the last time the condition passed for that vector lane.
5075 The first match will be a 1 to allow 0 to be used for non-matching
5076 indexes. If there are no matches at all then the vector will be all
5077 zeroes.
5079 PR92772: This algorithm is broken for architectures that support
5080 masked vectors, but do not provide fold_extract_last. */
5081 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5083 auto_vec<std::pair<tree, bool>, 2> ccompares;
5084 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5085 cond_info = vect_stmt_to_vectorize (cond_info);
5086 while (cond_info != reduc_info)
5088 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5090 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5091 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5092 ccompares.safe_push
5093 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5094 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5096 cond_info
5097 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5098 1 + STMT_VINFO_REDUC_IDX
5099 (cond_info)));
5100 cond_info = vect_stmt_to_vectorize (cond_info);
5102 gcc_assert (ccompares.length () != 0);
5104 tree indx_before_incr, indx_after_incr;
5105 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5106 int scalar_precision
5107 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5108 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5109 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5110 (TYPE_MODE (vectype), cr_index_scalar_type,
5111 TYPE_VECTOR_SUBPARTS (vectype));
5113 /* First we create a simple vector induction variable which starts
5114 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5115 vector size (STEP). */
5117 /* Create a {1,2,3,...} vector. */
5118 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5120 /* Create a vector of the step value. */
5121 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5122 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5124 /* Create an induction variable. */
5125 gimple_stmt_iterator incr_gsi;
5126 bool insert_after;
5127 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5128 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5129 insert_after, &indx_before_incr, &indx_after_incr);
5131 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5132 filled with zeros (VEC_ZERO). */
5134 /* Create a vector of 0s. */
5135 tree zero = build_zero_cst (cr_index_scalar_type);
5136 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5138 /* Create a vector phi node. */
5139 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5140 new_phi = create_phi_node (new_phi_tree, loop->header);
5141 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5142 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5144 /* Now take the condition from the loop's original cond_exprs
5145 and produce a new cond_expr (INDEX_COND_EXPR) which for
5146 every match uses values from the induction variable
5147 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5148 (NEW_PHI_TREE).
5149 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5150 the new cond_expr (INDEX_COND_EXPR). */
5151 gimple_seq stmts = NULL;
5152 for (int i = ccompares.length () - 1; i != -1; --i)
5154 tree ccompare = ccompares[i].first;
5155 if (ccompares[i].second)
5156 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5157 cr_index_vector_type,
5158 ccompare,
5159 indx_before_incr, new_phi_tree);
5160 else
5161 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5162 cr_index_vector_type,
5163 ccompare,
5164 new_phi_tree, indx_before_incr);
5166 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5168 /* Update the phi with the vec cond. */
5169 induction_index = new_phi_tree;
5170 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5171 loop_latch_edge (loop), UNKNOWN_LOCATION);
5174 /* 2. Create epilog code.
5175 The reduction epilog code operates across the elements of the vector
5176 of partial results computed by the vectorized loop.
5177 The reduction epilog code consists of:
5179 step 1: compute the scalar result in a vector (v_out2)
5180 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5181 step 3: adjust the scalar result (s_out3) if needed.
5183 Step 1 can be accomplished using one of the following three schemes:
5184 (scheme 1) using reduc_fn, if available.
5185 (scheme 2) using whole-vector shifts, if available.
5186 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5187 combined.
5189 The overall epilog code looks like this:
5191 s_out0 = phi <s_loop> # original EXIT_PHI
5192 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5193 v_out2 = reduce <v_out1> # step 1
5194 s_out3 = extract_field <v_out2, 0> # step 2
5195 s_out4 = adjust_result <s_out3> # step 3
5197 (step 3 is optional, and steps 1 and 2 may be combined).
5198 Lastly, the uses of s_out0 are replaced by s_out4. */
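/* A rough worked example for a PLUS reduction with 4 lanes: if the loop
   exits with partial sums v_out1 = {3,5,7,9}, step 1 reduces this to 24,
   step 2 extracts that scalar, and step 3 adds back any initial value
   that was kept out of the vectorized accumulation (ADJUSTMENT_DEF),
   e.g. 24 + 10 = 34 for an initial value of 10.  */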
5201 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5202 v_out1 = phi <VECT_DEF>
5203 Store them in NEW_PHIS. */
5204 if (double_reduc)
5205 loop = outer_loop;
5206 exit_bb = single_exit (loop)->dest;
5207 new_phis.create (slp_node ? vec_num : ncopies);
5208 for (unsigned i = 0; i < vec_num; i++)
5210 if (slp_node)
5211 def = vect_get_slp_vect_def (slp_node, i);
5212 else
5213 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5214 for (j = 0; j < ncopies; j++)
5216 tree new_def = copy_ssa_name (def);
5217 phi = create_phi_node (new_def, exit_bb);
5218 if (j == 0)
5219 new_phis.quick_push (phi);
5220 else
5222 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5223 new_phis.quick_push (phi);
5226 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5230 exit_gsi = gsi_after_labels (exit_bb);
5232 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5233 (i.e. when reduc_fn is not available) and in the final adjustment
5234 code (if needed). Also get the original scalar reduction variable as
5235 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5236 represents a reduction pattern), the tree-code and scalar-def are
5237 taken from the original stmt that the pattern-stmt (STMT) replaces.
5238 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5239 are taken from STMT. */
5241 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5242 if (orig_stmt_info != stmt_info)
5244 /* Reduction pattern */
5245 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5246 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5249 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5250 scalar_type = TREE_TYPE (scalar_dest);
5251 scalar_results.create (group_size);
5252 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5253 bitsize = TYPE_SIZE (scalar_type);
5255 /* SLP reduction without reduction chain, e.g.,
5256 # a1 = phi <a2, a0>
5257 # b1 = phi <b2, b0>
5258 a2 = operation (a1)
5259 b2 = operation (b1) */
5260 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5262 /* True if we should implement SLP_REDUC using native reduction operations
5263 instead of scalar operations. */
5264 direct_slp_reduc = (reduc_fn != IFN_LAST
5265 && slp_reduc
5266 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5268 /* In case of reduction chain, e.g.,
5269 # a1 = phi <a3, a0>
5270 a2 = operation (a1)
5271 a3 = operation (a2),
5273 we may end up with more than one vector result. Here we reduce them to
5274 one vector. */
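/* Roughly, with two partial result vectors v0 and v1 and a PLUS
   reduction, the block below converts both to VECTYPE and emits
   first = v0 + v1; FIRST then stands in for the single vector of
   partial results used by the rest of the epilogue.  */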
5275 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5277 gimple_seq stmts = NULL;
5278 tree first_vect = PHI_RESULT (new_phis[0]);
5279 first_vect = gimple_convert (&stmts, vectype, first_vect);
5280 for (k = 1; k < new_phis.length (); k++)
5282 gimple *next_phi = new_phis[k];
5283 tree second_vect = PHI_RESULT (next_phi);
5284 second_vect = gimple_convert (&stmts, vectype, second_vect);
5285 first_vect = gimple_build (&stmts, code, vectype,
5286 first_vect, second_vect);
5288 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5290 new_phi_result = first_vect;
5291 new_phis.truncate (0);
5292 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5294 /* Likewise if we couldn't use a single def-use cycle. */
5295 else if (ncopies > 1)
5297 gimple_seq stmts = NULL;
5298 tree first_vect = PHI_RESULT (new_phis[0]);
5299 first_vect = gimple_convert (&stmts, vectype, first_vect);
5300 for (int k = 1; k < ncopies; ++k)
5302 tree second_vect = PHI_RESULT (new_phis[k]);
5303 second_vect = gimple_convert (&stmts, vectype, second_vect);
5304 first_vect = gimple_build (&stmts, code, vectype,
5305 first_vect, second_vect);
5307 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5308 new_phi_result = first_vect;
5309 new_phis.truncate (0);
5310 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5312 else
5313 new_phi_result = PHI_RESULT (new_phis[0]);
5315 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5316 && reduc_fn != IFN_LAST)
5318 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5319 various data values where the condition matched and another vector
5320 (INDUCTION_INDEX) containing all the indexes of those matches. We
5321 need to extract the last matching index (which will be the index with
5322 highest value) and use this to index into the data vector.
5323 For the case where there were no matches, the data vector will contain
5324 all default values and the index vector will be all zeros. */
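/* For example (hypothetical values): with INDUCTION_INDEX = {0,2,0,8}
   and data {d0,d1,d2,d3}, REDUC_MAX over the indexes gives 8, the
   comparison selects only lane 3, the VEC_COND yields {0,0,0,d3}, and
   the final (unsigned) REDUC_MAX extracts d3.  */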
5326 /* Get various versions of the type of the vector of indexes. */
5327 tree index_vec_type = TREE_TYPE (induction_index);
5328 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5329 tree index_scalar_type = TREE_TYPE (index_vec_type);
5330 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5332 /* Get an unsigned integer version of the type of the data vector. */
5333 int scalar_precision
5334 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5335 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5336 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5337 vectype);
5339 /* First we need to create a vector (ZERO_VEC) of zeros and another
5340 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5341 can create using a MAX reduction and then expanding.
5342 In the case where the loop never made any matches, the max index will
5343 be zero. */
5345 /* Vector of {0, 0, 0,...}. */
5346 tree zero_vec = build_zero_cst (vectype);
5348 gimple_seq stmts = NULL;
5349 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5350 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5352 /* Find maximum value from the vector of found indexes. */
5353 tree max_index = make_ssa_name (index_scalar_type);
5354 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5355 1, induction_index);
5356 gimple_call_set_lhs (max_index_stmt, max_index);
5357 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5359 /* Vector of {max_index, max_index, max_index,...}. */
5360 tree max_index_vec = make_ssa_name (index_vec_type);
5361 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5362 max_index);
5363 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5364 max_index_vec_rhs);
5365 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5367 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5368 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5369 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5370 otherwise. Only one value should match, resulting in a vector
5371 (VEC_COND) with one data value and the rest zeros.
5372 In the case where the loop never made any matches, every index will
5373 match, resulting in a vector with all data values (which will all be
5374 the default value). */
5376 /* Compare the max index vector to the vector of found indexes to find
5377 the position of the max value. */
5378 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5379 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5380 induction_index,
5381 max_index_vec);
5382 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5384 /* Use the compare to choose either values from the data vector or
5385 zero. */
5386 tree vec_cond = make_ssa_name (vectype);
5387 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5388 vec_compare, new_phi_result,
5389 zero_vec);
5390 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5392 /* Finally we need to extract the data value from the vector (VEC_COND)
5393 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5394 reduction, but because this doesn't exist we can use a MAX reduction
5395 instead: the unselected lanes are all zero, so viewed as unsigned the
5396 maximum is the selected value. The data might be signed or a float, so we need to cast it first.
5397 In the case where the loop never made any matches, the data values are
5398 all identical, and so will reduce down correctly. */
5400 /* Make the matched data values unsigned. */
5401 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5402 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5403 vec_cond);
5404 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5405 VIEW_CONVERT_EXPR,
5406 vec_cond_cast_rhs);
5407 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5409 /* Reduce down to a scalar value. */
5410 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5411 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5412 1, vec_cond_cast);
5413 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5414 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5416 /* Convert the reduced value back to the result type and set as the
5417 result. */
5418 stmts = NULL;
5419 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5420 data_reduc);
5421 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5422 scalar_results.safe_push (new_temp);
5424 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5425 && reduc_fn == IFN_LAST)
5427 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5428 idx = 0;
5429 idx_val = induction_index[0];
5430 val = data_reduc[0];
5431 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5432 if (induction_index[i] > idx_val)
5433 val = data_reduc[i], idx_val = induction_index[i];
5434 return val; */
5436 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5437 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5438 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5439 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5440 /* Enforced by vectorizable_reduction, which ensures we have target
5441 support before allowing a conditional reduction on variable-length
5442 vectors. */
5443 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5444 tree idx_val = NULL_TREE, val = NULL_TREE;
5445 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5447 tree old_idx_val = idx_val;
5448 tree old_val = val;
5449 idx_val = make_ssa_name (idx_eltype);
5450 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5451 build3 (BIT_FIELD_REF, idx_eltype,
5452 induction_index,
5453 bitsize_int (el_size),
5454 bitsize_int (off)));
5455 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5456 val = make_ssa_name (data_eltype);
5457 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5458 build3 (BIT_FIELD_REF,
5459 data_eltype,
5460 new_phi_result,
5461 bitsize_int (el_size),
5462 bitsize_int (off)));
5463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5464 if (off != 0)
5466 tree new_idx_val = idx_val;
5467 if (off != v_size - el_size)
5469 new_idx_val = make_ssa_name (idx_eltype);
5470 epilog_stmt = gimple_build_assign (new_idx_val,
5471 MAX_EXPR, idx_val,
5472 old_idx_val);
5473 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5475 tree new_val = make_ssa_name (data_eltype);
5476 epilog_stmt = gimple_build_assign (new_val,
5477 COND_EXPR,
5478 build2 (GT_EXPR,
5479 boolean_type_node,
5480 idx_val,
5481 old_idx_val),
5482 val, old_val);
5483 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5484 idx_val = new_idx_val;
5485 val = new_val;
5488 /* Convert the reduced value back to the result type and set as the
5489 result. */
5490 gimple_seq stmts = NULL;
5491 val = gimple_convert (&stmts, scalar_type, val);
5492 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5493 scalar_results.safe_push (val);
5496 /* 2.3 Create the reduction code, using one of the three schemes described
5497 above. In SLP we simply need to extract all the elements from the
5498 vector (without reducing them), so we use scalar shifts. */
5499 else if (reduc_fn != IFN_LAST && !slp_reduc)
5501 tree tmp;
5502 tree vec_elem_type;
5504 /* Case 1: Create:
5505 v_out2 = reduc_expr <v_out1> */
5507 if (dump_enabled_p ())
5508 dump_printf_loc (MSG_NOTE, vect_location,
5509 "Reduce using direct vector reduction.\n");
5511 gimple_seq stmts = NULL;
5512 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5513 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5514 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5515 vec_elem_type, new_phi_result);
5516 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5517 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5519 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5520 && induc_val)
5522 /* Earlier we set the initial value to be a vector of induc_val
5523 values. Check the result and if it is induc_val then replace
5524 it with the original initial value, unless induc_val is
5525 the same as initial_def already. */
5526 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5527 induc_val);
5529 tmp = make_ssa_name (new_scalar_dest);
5530 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5531 initial_def, new_temp);
5532 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5533 new_temp = tmp;
5536 scalar_results.safe_push (new_temp);
5538 else if (direct_slp_reduc)
5540 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5541 with the elements for other SLP statements replaced with the
5542 neutral value. We can then do a normal reduction on each vector. */
5544 /* Enforced by vectorizable_reduction. */
5545 gcc_assert (new_phis.length () == 1);
5546 gcc_assert (pow2p_hwi (group_size));
5548 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5549 vec<stmt_vec_info> orig_phis
5550 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5551 gimple_seq seq = NULL;
5553 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5554 and the same element size as VECTYPE. */
5555 tree index = build_index_vector (vectype, 0, 1);
5556 tree index_type = TREE_TYPE (index);
5557 tree index_elt_type = TREE_TYPE (index_type);
5558 tree mask_type = truth_type_for (index_type);
5560 /* Create a vector that, for each element, identifies which of
5561 the REDUC_GROUP_SIZE results should use it. */
5562 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5563 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5564 build_vector_from_val (index_type, index_mask));
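/* For example, with a group size of 2 and an 8-element vector this
   yields {0,1,0,1,0,1,0,1}: even elements belong to the first SLP
   statement and odd elements to the second.  */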
5566 /* Get a neutral vector value. This is simply a splat of the neutral
5567 scalar value if we have one, otherwise the initial scalar value
5568 is itself a neutral value. */
5569 tree vector_identity = NULL_TREE;
5570 tree neutral_op = NULL_TREE;
5571 if (slp_node)
5573 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5574 neutral_op
5575 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5576 vectype, code, first != NULL);
5578 if (neutral_op)
5579 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5580 neutral_op);
5581 for (unsigned int i = 0; i < group_size; ++i)
5583 /* If there's no universal neutral value, we can use the
5584 initial scalar value from the original PHI. This is used
5585 for MIN and MAX reduction, for example. */
5586 if (!neutral_op)
5588 tree scalar_value
5589 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5590 loop_preheader_edge (loop));
5591 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5592 scalar_value);
5593 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5594 scalar_value);
5597 /* Calculate the equivalent of:
5599 sel[j] = (index[j] == i);
5601 which selects the elements of NEW_PHI_RESULT that should
5602 be included in the result. */
5603 tree compare_val = build_int_cst (index_elt_type, i);
5604 compare_val = build_vector_from_val (index_type, compare_val);
5605 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5606 index, compare_val);
5608 /* Calculate the equivalent of:
5610 vec = sel ? new_phi_result : vector_identity;
5612 VEC is now suitable for a full vector reduction. */
5613 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5614 sel, new_phi_result, vector_identity);
5616 /* Do the reduction and convert it to the appropriate type. */
5617 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5618 TREE_TYPE (vectype), vec);
5619 scalar = gimple_convert (&seq, scalar_type, scalar);
5620 scalar_results.safe_push (scalar);
5622 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5624 else
5626 bool reduce_with_shift;
5627 tree vec_temp;
5629 gcc_assert (slp_reduc || new_phis.length () == 1);
5631 /* See if the target wants to do the final (shift) reduction
5632 in a vector mode of smaller size and first reduce upper/lower
5633 halves against each other. */
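/* For example, a target may prefer to first extract the two halves of
   an 8-element vector, combine them with CODE, and only then do the
   final shift or scalar reduction on 4 elements.  */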
5634 enum machine_mode mode1 = mode;
5635 tree stype = TREE_TYPE (vectype);
5636 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5637 unsigned nunits1 = nunits;
5638 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5639 && new_phis.length () == 1)
5641 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5642 /* For SLP reductions we have to make sure lanes match up, but
5643 since we're doing an individual-element final reduction, reducing
5644 the vector width here is even more important.
5645 ??? We can also separate lanes with permutes, for the common
5646 case of power-of-two group-size odd/even extracts would work. */
5647 if (slp_reduc && nunits != nunits1)
5649 nunits1 = least_common_multiple (nunits1, group_size);
5650 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5653 if (!slp_reduc
5654 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5655 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5657 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5658 stype, nunits1);
5659 reduce_with_shift = have_whole_vector_shift (mode1);
5660 if (!VECTOR_MODE_P (mode1))
5661 reduce_with_shift = false;
5662 else
5664 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5665 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5666 reduce_with_shift = false;
5669 /* First reduce the vector to the desired vector size we should
5670 do shift reduction on by combining upper and lower halves. */
5671 new_temp = new_phi_result;
5672 while (nunits > nunits1)
5674 nunits /= 2;
5675 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5676 stype, nunits);
5677 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5679 /* The target has to make sure we support lowpart/highpart
5680 extraction, either via direct vector extract or through
5681 integer mode punning.
5682 tree dst1, dst2;
5683 if (convert_optab_handler (vec_extract_optab,
5684 TYPE_MODE (TREE_TYPE (new_temp)),
5685 TYPE_MODE (vectype1))
5686 != CODE_FOR_nothing)
5688 /* Extract sub-vectors directly once vec_extract becomes
5689 a conversion optab. */
5690 dst1 = make_ssa_name (vectype1);
5691 epilog_stmt
5692 = gimple_build_assign (dst1, BIT_FIELD_REF,
5693 build3 (BIT_FIELD_REF, vectype1,
5694 new_temp, TYPE_SIZE (vectype1),
5695 bitsize_int (0)));
5696 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5697 dst2 = make_ssa_name (vectype1);
5698 epilog_stmt
5699 = gimple_build_assign (dst2, BIT_FIELD_REF,
5700 build3 (BIT_FIELD_REF, vectype1,
5701 new_temp, TYPE_SIZE (vectype1),
5702 bitsize_int (bitsize)));
5703 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5705 else
5707 /* Extract via punning to an appropriately sized integer mode
5708 vector. */
5709 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5710 tree etype = build_vector_type (eltype, 2);
5711 gcc_assert (convert_optab_handler (vec_extract_optab,
5712 TYPE_MODE (etype),
5713 TYPE_MODE (eltype))
5714 != CODE_FOR_nothing);
5715 tree tem = make_ssa_name (etype);
5716 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5717 build1 (VIEW_CONVERT_EXPR,
5718 etype, new_temp));
5719 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5720 new_temp = tem;
5721 tem = make_ssa_name (eltype);
5722 epilog_stmt
5723 = gimple_build_assign (tem, BIT_FIELD_REF,
5724 build3 (BIT_FIELD_REF, eltype,
5725 new_temp, TYPE_SIZE (eltype),
5726 bitsize_int (0)));
5727 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5728 dst1 = make_ssa_name (vectype1);
5729 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5730 build1 (VIEW_CONVERT_EXPR,
5731 vectype1, tem));
5732 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5733 tem = make_ssa_name (eltype);
5734 epilog_stmt
5735 = gimple_build_assign (tem, BIT_FIELD_REF,
5736 build3 (BIT_FIELD_REF, eltype,
5737 new_temp, TYPE_SIZE (eltype),
5738 bitsize_int (bitsize)));
5739 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5740 dst2 = make_ssa_name (vectype1);
5741 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5742 build1 (VIEW_CONVERT_EXPR,
5743 vectype1, tem));
5744 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5747 new_temp = make_ssa_name (vectype1);
5748 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5749 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5750 new_phis[0] = epilog_stmt;
5753 if (reduce_with_shift && !slp_reduc)
5755 int element_bitsize = tree_to_uhwi (bitsize);
5756 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5757 for variable-length vectors and also requires direct target support
5758 for loop reductions. */
5759 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5760 int nelements = vec_size_in_bits / element_bitsize;
5761 vec_perm_builder sel;
5762 vec_perm_indices indices;
5764 int elt_offset;
5766 tree zero_vec = build_zero_cst (vectype1);
5767 /* Case 2: Create:
5768 for (offset = nelements/2; offset >= 1; offset/=2)
5770 Create: va' = vec_shift <va, offset>
5771 Create: va = vop <va, va'>
5772 } */
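/* For instance, reducing {1,2,3,4} with PLUS: shifting by 2 gives
   {3,4,0,0} and adding yields {4,6,3,4}; shifting by 1 gives {6,3,4,0}
   and adding yields {10,9,7,4}, so the full sum (10) ends up in the
   element extracted below.  */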
5774 tree rhs;
5776 if (dump_enabled_p ())
5777 dump_printf_loc (MSG_NOTE, vect_location,
5778 "Reduce using vector shifts\n");
5780 gimple_seq stmts = NULL;
5781 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5782 for (elt_offset = nelements / 2;
5783 elt_offset >= 1;
5784 elt_offset /= 2)
5786 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5787 indices.new_vector (sel, 2, nelements);
5788 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5789 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5790 new_temp, zero_vec, mask);
5791 new_temp = gimple_build (&stmts, code,
5792 vectype1, new_name, new_temp);
5794 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5796 /* 2.4 Extract the final scalar result. Create:
5797 s_out3 = extract_field <v_out2, bitpos> */
5799 if (dump_enabled_p ())
5800 dump_printf_loc (MSG_NOTE, vect_location,
5801 "extract scalar result\n");
5803 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5804 bitsize, bitsize_zero_node);
5805 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5806 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5807 gimple_assign_set_lhs (epilog_stmt, new_temp);
5808 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5809 scalar_results.safe_push (new_temp);
5811 else
5813 /* Case 3: Create:
5814 s = extract_field <v_out2, 0>
5815 for (offset = element_size;
5816 offset < vector_size;
5817 offset += element_size;)
5819 Create: s' = extract_field <v_out2, offset>
5820 Create: s = op <s, s'> // For non SLP cases
5821 } */
5823 if (dump_enabled_p ())
5824 dump_printf_loc (MSG_NOTE, vect_location,
5825 "Reduce using scalar code.\n");
5827 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5828 int element_bitsize = tree_to_uhwi (bitsize);
5829 tree compute_type = TREE_TYPE (vectype);
5830 gimple_seq stmts = NULL;
5831 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5833 int bit_offset;
5834 if (gimple_code (new_phi) == GIMPLE_PHI)
5835 vec_temp = PHI_RESULT (new_phi);
5836 else
5837 vec_temp = gimple_assign_lhs (new_phi);
5838 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5839 vec_temp, bitsize, bitsize_zero_node);
5841 /* In SLP we don't need to apply the reduction operation, so we just
5842 collect s' values in SCALAR_RESULTS. */
5843 if (slp_reduc)
5844 scalar_results.safe_push (new_temp);
5846 for (bit_offset = element_bitsize;
5847 bit_offset < vec_size_in_bits;
5848 bit_offset += element_bitsize)
5850 tree bitpos = bitsize_int (bit_offset);
5851 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5852 compute_type, vec_temp,
5853 bitsize, bitpos);
5854 if (slp_reduc)
5856 /* In SLP we don't need to apply the reduction operation, so
5857 we just collect s' values in SCALAR_RESULTS. */
5858 new_temp = new_name;
5859 scalar_results.safe_push (new_name);
5861 else
5862 new_temp = gimple_build (&stmts, code, compute_type,
5863 new_name, new_temp);
5867 /* The only case where we need to reduce scalar results in SLP is
5868 unrolling. If the size of SCALAR_RESULTS is greater than
5869 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5870 REDUC_GROUP_SIZE. */
5871 if (slp_reduc)
5873 tree res, first_res, new_res;
5875 /* Reduce multiple scalar results in case of SLP unrolling. */
5876 for (j = group_size; scalar_results.iterate (j, &res);
5877 j++)
5879 first_res = scalar_results[j % group_size];
5880 new_res = gimple_build (&stmts, code, compute_type,
5881 first_res, res);
5882 scalar_results[j % group_size] = new_res;
5884 for (k = 0; k < group_size; k++)
5885 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5886 scalar_results[k]);
5888 else
5890 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5891 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5892 scalar_results.safe_push (new_temp);
5895 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5898 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5899 && induc_val)
5901 /* Earlier we set the initial value to be a vector of induc_val
5902 values. Check the result and if it is induc_val then replace
5903 it with the original initial value, unless induc_val is
5904 the same as initial_def already. */
5905 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5906 induc_val);
5908 tree tmp = make_ssa_name (new_scalar_dest);
5909 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5910 initial_def, new_temp);
5911 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5912 scalar_results[0] = tmp;
5916 /* 2.5 Adjust the final result by the initial value of the reduction
5917 variable. (When such adjustment is not needed, then
5918 'adjustment_def' is zero). For example, if code is PLUS we create:
5919 new_temp = loop_exit_def + adjustment_def */
5921 if (adjustment_def)
5923 gcc_assert (!slp_reduc);
5924 gimple_seq stmts = NULL;
5925 if (nested_in_vect_loop)
5927 new_phi = new_phis[0];
5928 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5929 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5930 new_temp = gimple_build (&stmts, code, vectype,
5931 PHI_RESULT (new_phi), adjustment_def);
5933 else
5935 new_temp = scalar_results[0];
5936 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5937 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5938 new_temp = gimple_build (&stmts, code, scalar_type,
5939 new_temp, adjustment_def);
5942 epilog_stmt = gimple_seq_last_stmt (stmts);
5943 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5944 if (nested_in_vect_loop)
5946 if (!double_reduc)
5947 scalar_results.quick_push (new_temp);
5948 else
5949 scalar_results[0] = new_temp;
5951 else
5952 scalar_results[0] = new_temp;
5954 new_phis[0] = epilog_stmt;
5957 if (double_reduc)
5958 loop = loop->inner;
5960 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5961 phis with new adjusted scalar results, i.e., replace use <s_out0>
5962 with use <s_out4>.
5964 Transform:
5965 loop_exit:
5966 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5967 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5968 v_out2 = reduce <v_out1>
5969 s_out3 = extract_field <v_out2, 0>
5970 s_out4 = adjust_result <s_out3>
5971 use <s_out0>
5972 use <s_out0>
5974 into:
5976 loop_exit:
5977 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5978 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5979 v_out2 = reduce <v_out1>
5980 s_out3 = extract_field <v_out2, 0>
5981 s_out4 = adjust_result <s_out3>
5982 use <s_out4>
5983 use <s_out4> */
5986 /* In an SLP reduction chain we reduce the vector results into one vector
5987 if necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5988 LHS of the last stmt in the reduction chain, since we are looking for
5989 the loop exit phi node. */
5990 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5992 stmt_vec_info dest_stmt_info
5993 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5994 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5995 group_size = 1;
5998 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5999 case REDUC_GROUP_SIZE is greater than the vectorization factor).
6000 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
6001 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
6002 correspond to the first vector stmt, etc.
6003 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
6004 if (group_size > new_phis.length ())
6005 gcc_assert (!(group_size % new_phis.length ()));
6007 for (k = 0; k < group_size; k++)
6009 if (slp_reduc)
6011 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6013 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
6014 /* SLP statements can't participate in patterns. */
6015 gcc_assert (!orig_stmt_info);
6016 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
6019 if (nested_in_vect_loop)
6021 if (double_reduc)
6022 loop = outer_loop;
6023 else
6024 gcc_unreachable ();
6027 phis.create (3);
6028 /* Find the loop-closed-use at the loop exit of the original scalar
6029 result. (The reduction result is expected to have two immediate uses,
6030 one at the latch block, and one at the loop exit). For double
6031 reductions we are looking for exit phis of the outer loop. */
6032 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6034 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6036 if (!is_gimple_debug (USE_STMT (use_p)))
6037 phis.safe_push (USE_STMT (use_p));
6039 else
6041 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6043 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6045 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6047 if (!flow_bb_inside_loop_p (loop,
6048 gimple_bb (USE_STMT (phi_use_p)))
6049 && !is_gimple_debug (USE_STMT (phi_use_p)))
6050 phis.safe_push (USE_STMT (phi_use_p));
6056 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6058 /* Replace the uses: */
6059 orig_name = PHI_RESULT (exit_phi);
6060 scalar_result = scalar_results[k];
6061 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6063 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6064 SET_USE (use_p, scalar_result);
6065 update_stmt (use_stmt);
6069 phis.release ();
6073 /* Return a vector of type VECTYPE that is equal to the vector select
6074 operation "MASK ? VEC : IDENTITY". Insert the select statements
6075 before GSI. */
6077 static tree
6078 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6079 tree vec, tree identity)
6081 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6082 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6083 mask, vec, identity);
6084 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6085 return cond;
6088 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6089 order, starting with LHS. Insert the extraction statements before GSI and
6090 associate the new scalar SSA names with variable SCALAR_DEST.
6091 Return the SSA name for the result. */
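/* For example, for a 4-element VECTOR_RHS and PLUS_EXPR this emits
   lhs = (((lhs + v[0]) + v[1]) + v[2]) + v[3], extracting each element
   with a BIT_FIELD_REF.  */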
6093 static tree
6094 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6095 tree_code code, tree lhs, tree vector_rhs)
6097 tree vectype = TREE_TYPE (vector_rhs);
6098 tree scalar_type = TREE_TYPE (vectype);
6099 tree bitsize = TYPE_SIZE (scalar_type);
6100 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6101 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6103 for (unsigned HOST_WIDE_INT bit_offset = 0;
6104 bit_offset < vec_size_in_bits;
6105 bit_offset += element_bitsize)
6107 tree bitpos = bitsize_int (bit_offset);
6108 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6109 bitsize, bitpos);
6111 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6112 rhs = make_ssa_name (scalar_dest, stmt);
6113 gimple_assign_set_lhs (stmt, rhs);
6114 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6116 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6117 tree new_name = make_ssa_name (scalar_dest, stmt);
6118 gimple_assign_set_lhs (stmt, new_name);
6119 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6120 lhs = new_name;
6122 return lhs;
6125 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6126 type of the vector input. */
6128 static internal_fn
6129 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6131 internal_fn mask_reduc_fn;
6133 switch (reduc_fn)
6135 case IFN_FOLD_LEFT_PLUS:
6136 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6137 break;
6139 default:
6140 return IFN_LAST;
6143 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6144 OPTIMIZE_FOR_SPEED))
6145 return mask_reduc_fn;
6146 return IFN_LAST;
6149 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6150 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6151 statement. CODE is the operation performed by STMT_INFO and OPS are
6152 its scalar operands. REDUC_INDEX is the index of the operand in
6153 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6154 implements in-order reduction, or IFN_LAST if we should open-code it.
6155 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6156 that should be used to control the operation in a fully-masked loop. */
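/* For instance, with IFN_FOLD_LEFT_PLUS in a fully-masked loop this
   emits one call per vector def of the form
   reduc_var = .MASK_FOLD_LEFT_PLUS (reduc_var, vec_def, mask);
   which accumulates the active lanes strictly in the original scalar
   order.  */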
6158 static bool
6159 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6160 stmt_vec_info stmt_info,
6161 gimple_stmt_iterator *gsi,
6162 gimple **vec_stmt, slp_tree slp_node,
6163 gimple *reduc_def_stmt,
6164 tree_code code, internal_fn reduc_fn,
6165 tree ops[3], tree vectype_in,
6166 int reduc_index, vec_loop_masks *masks)
6168 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6169 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6170 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6172 int ncopies;
6173 if (slp_node)
6174 ncopies = 1;
6175 else
6176 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6178 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6179 gcc_assert (ncopies == 1);
6180 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6182 if (slp_node)
6183 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6184 TYPE_VECTOR_SUBPARTS (vectype_in)));
6186 tree op0 = ops[1 - reduc_index];
6188 int group_size = 1;
6189 stmt_vec_info scalar_dest_def_info;
6190 auto_vec<tree> vec_oprnds0;
6191 if (slp_node)
6193 auto_vec<vec<tree> > vec_defs (2);
6194 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6195 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6196 vec_defs[0].release ();
6197 vec_defs[1].release ();
6198 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6199 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6201 else
6203 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6204 op0, &vec_oprnds0);
6205 scalar_dest_def_info = stmt_info;
6208 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6209 tree scalar_type = TREE_TYPE (scalar_dest);
6210 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6212 int vec_num = vec_oprnds0.length ();
6213 gcc_assert (vec_num == 1 || slp_node);
6214 tree vec_elem_type = TREE_TYPE (vectype_out);
6215 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6217 tree vector_identity = NULL_TREE;
6218 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6219 vector_identity = build_zero_cst (vectype_out);
6221 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6222 int i;
6223 tree def0;
6224 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6226 gimple *new_stmt;
6227 tree mask = NULL_TREE;
6228 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6229 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6231 /* Handle MINUS by adding the negative. */
6232 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6234 tree negated = make_ssa_name (vectype_out);
6235 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6236 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6237 def0 = negated;
6240 if (mask && mask_reduc_fn == IFN_LAST)
6241 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6242 vector_identity);
6244 /* On the first iteration the input is simply the scalar phi
6245 result, and for subsequent iterations it is the output of
6246 the preceding operation. */
6247 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6249 if (mask && mask_reduc_fn != IFN_LAST)
6250 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6251 def0, mask);
6252 else
6253 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6254 def0);
6255 /* For chained SLP reductions the output of the previous reduction
6256 operation serves as the input of the next. For the final statement
6257 the output cannot be a temporary - we reuse the original
6258 scalar destination of the last statement. */
6259 if (i != vec_num - 1)
6261 gimple_set_lhs (new_stmt, scalar_dest_var);
6262 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6263 gimple_set_lhs (new_stmt, reduc_var);
6266 else
6268 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6269 reduc_var, def0);
6270 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6271 /* Remove the statement, so that we can use the same code paths
6272 as for statements that we've just created. */
6273 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6274 gsi_remove (&tmp_gsi, true);
6277 if (i == vec_num - 1)
6279 gimple_set_lhs (new_stmt, scalar_dest);
6280 vect_finish_replace_stmt (loop_vinfo,
6281 scalar_dest_def_info,
6282 new_stmt);
6284 else
6285 vect_finish_stmt_generation (loop_vinfo,
6286 scalar_dest_def_info,
6287 new_stmt, gsi);
6289 if (slp_node)
6290 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6291 else
6293 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6294 *vec_stmt = new_stmt;
6298 return true;
6301 /* Function is_nonwrapping_integer_induction.
6303 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6304 does not cause overflow. */
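/* In other words: require BASE and STEP to be INTEGER_CSTs and check
   that BASE + STEP * the maximum iteration count still fits in the
   precision of the induction's type (assumed to hold when overflow is
   undefined for that type).  */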
6306 static bool
6307 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6309 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6310 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6311 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6312 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6313 widest_int ni, max_loop_value, lhs_max;
6314 wi::overflow_type overflow = wi::OVF_NONE;
6316 /* Make sure the loop is integer based. */
6317 if (TREE_CODE (base) != INTEGER_CST
6318 || TREE_CODE (step) != INTEGER_CST)
6319 return false;
6321 /* Check that the max size of the loop will not wrap. */
6323 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6324 return true;
6326 if (! max_stmt_executions (loop, &ni))
6327 return false;
6329 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6330 &overflow);
6331 if (overflow)
6332 return false;
6334 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6335 TYPE_SIGN (lhs_type), &overflow);
6336 if (overflow)
6337 return false;
6339 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6340 <= TYPE_PRECISION (lhs_type));
6343 /* Check if masking can be supported by inserting a conditional expression.
6344 CODE is the code for the operation. COND_FN is the conditional internal
6345 function, if it exists. VECTYPE_IN is the type of the vector input. */
6346 static bool
6347 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6348 tree vectype_in)
6350 if (cond_fn != IFN_LAST
6351 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6352 OPTIMIZE_FOR_SPEED))
6353 return false;
6355 switch (code)
6357 case DOT_PROD_EXPR:
6358 case SAD_EXPR:
6359 return true;
6361 default:
6362 return false;
6366 /* Insert a conditional expression to enable masked vectorization. CODE is the
6367 code for the operation. VOP is the array of operands. MASK is the loop
6368 mask. GSI is a statement iterator used to place the new conditional
6369 expression. */
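/* For example, for DOT_PROD_EXPR the second operand becomes
   mask ? op1 : 0, so inactive lanes add 0 to the accumulator; for
   SAD_EXPR it becomes mask ? op1 : op0, so inactive lanes contribute
   |op0 - op0| = 0.  */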
6370 static void
6371 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6372 gimple_stmt_iterator *gsi)
6374 switch (code)
6376 case DOT_PROD_EXPR:
6378 tree vectype = TREE_TYPE (vop[1]);
6379 tree zero = build_zero_cst (vectype);
6380 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6381 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6382 mask, vop[1], zero);
6383 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6384 vop[1] = masked_op1;
6385 break;
6388 case SAD_EXPR:
6390 tree vectype = TREE_TYPE (vop[1]);
6391 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6392 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6393 mask, vop[1], vop[0]);
6394 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6395 vop[1] = masked_op1;
6396 break;
6399 default:
6400 gcc_unreachable ();
6404 /* Function vectorizable_reduction.
6406 Check if STMT_INFO performs a reduction operation that can be vectorized.
6407 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6408 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6409 Return true if STMT_INFO is vectorizable in this way.
6411 This function also handles reduction idioms (patterns) that have been
6412 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6413 may be of this form:
6414 X = pattern_expr (arg0, arg1, ..., X)
6415 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6416 sequence that had been detected and replaced by the pattern-stmt
6417 (STMT_INFO).
6419 This function also handles reduction of condition expressions, for example:
6420 for (int i = 0; i < N; i++)
6421 if (a[i] < value)
6422 last = a[i];
6423 This is handled by vectorizing the loop and creating an additional vector
6424 containing the loop indexes for which "a[i] < value" was true. In the
6425 function epilogue this is reduced to a single max value and then used to
6426 index into the vector of results.
6428 In some cases of reduction patterns, the type of the reduction variable X is
6429 different than the type of the other arguments of STMT_INFO.
6430 In such cases, the vectype that is used when transforming STMT_INFO into
6431 a vector stmt is different than the vectype that is used to determine the
6432 vectorization factor, because it consists of a different number of elements
6433 than the actual number of elements that are being operated upon in parallel.
6435 For example, consider an accumulation of shorts into an int accumulator.
6436 On some targets it's possible to vectorize this pattern operating on 8
6437 shorts at a time (hence, the vectype for purposes of determining the
6438 vectorization factor should be V8HI); on the other hand, the vectype that
6439 is used to create the vector form is actually V4SI (the type of the result).
6441 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6442 indicates what is the actual level of parallelism (V8HI in the example), so
6443 that the right vectorization factor would be derived. This vectype
6444 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6445 be used to create the vectorized stmt. The right vectype for the vectorized
6446 stmt is obtained from the type of the result X:
6447 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6449 This means that, contrary to "regular" reductions (or "regular" stmts in
6450 general), the following equation:
6451 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6452 does *NOT* necessarily hold for reduction patterns. */
6454 bool
6455 vectorizable_reduction (loop_vec_info loop_vinfo,
6456 stmt_vec_info stmt_info, slp_tree slp_node,
6457 slp_instance slp_node_instance,
6458 stmt_vector_for_cost *cost_vec)
6460 tree scalar_dest;
6461 tree vectype_in = NULL_TREE;
6462 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6463 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6464 stmt_vec_info cond_stmt_vinfo = NULL;
6465 tree scalar_type;
6466 int i;
6467 int ncopies;
6468 bool single_defuse_cycle = false;
6469 bool nested_cycle = false;
6470 bool double_reduc = false;
6471 int vec_num;
6472 tree tem;
6473 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6474 tree cond_reduc_val = NULL_TREE;
6476 /* Make sure it was already recognized as a reduction computation. */
6477 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6478 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6479 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6480 return false;
6482 /* The stmt we store reduction analysis meta on. */
6483 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6484 reduc_info->is_reduc_info = true;
6486 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6488 if (is_a <gphi *> (stmt_info->stmt))
6490 if (slp_node)
6492 /* We eventually need to set a vector type on invariant
6493 arguments. */
6494 unsigned j;
6495 slp_tree child;
6496 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6497 if (!vect_maybe_update_slp_op_vectype
6498 (child, SLP_TREE_VECTYPE (slp_node)))
6500 if (dump_enabled_p ())
6501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6502 "incompatible vector types for "
6503 "invariants\n");
6504 return false;
6507 /* Analysis for double-reduction is done on the outer
6508 loop PHI, nested cycles have no further restrictions. */
6509 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6511 else
6512 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6513 return true;
6516 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6517 stmt_vec_info phi_info = stmt_info;
6518 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6519 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6521 if (!is_a <gphi *> (stmt_info->stmt))
6523 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6524 return true;
6526 if (slp_node)
6528 slp_node_instance->reduc_phis = slp_node;
6529 /* ??? We're leaving slp_node to point to the PHIs; we only
6530 need it to get at the number of vector stmts which wasn't
6531 yet initialized for the instance root. */
6533 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6534 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6535 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6537 use_operand_p use_p;
6538 gimple *use_stmt;
6539 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6540 &use_p, &use_stmt);
6541 gcc_assert (res);
6542 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6543 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6547 /* PHIs should not participate in patterns. */
6548 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6549 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6551 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6552 and compute the reduction chain length. Discover the real
6553 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6554 tree reduc_def
6555 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6556 loop_latch_edge
6557 (gimple_bb (reduc_def_phi)->loop_father));
6558 unsigned reduc_chain_length = 0;
6559 bool only_slp_reduc_chain = true;
6560 stmt_info = NULL;
6561 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6562 while (reduc_def != PHI_RESULT (reduc_def_phi))
6564 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6565 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6566 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6568 if (dump_enabled_p ())
6569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6570 "reduction chain broken by patterns.\n");
6571 return false;
6573 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6574 only_slp_reduc_chain = false;
6575 /* ??? For epilogue generation live members of the chain need
6576 to point back to the PHI via their original stmt for
6577 info_for_reduction to work. */
6578 if (STMT_VINFO_LIVE_P (vdef))
6579 STMT_VINFO_REDUC_DEF (def) = phi_info;
6580 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6581 if (!assign)
6583 if (dump_enabled_p ())
6584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6585 "reduction chain includes calls.\n");
6586 return false;
6588 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6590 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6591 TREE_TYPE (gimple_assign_rhs1 (assign))))
6593 if (dump_enabled_p ())
6594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6595 "conversion in the reduction chain.\n");
6596 return false;
6599 else if (!stmt_info)
6600 /* First non-conversion stmt. */
6601 stmt_info = vdef;
6602 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6603 reduc_chain_length++;
6604 if (!stmt_info && slp_node)
6605 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6607 /* PHIs should not participate in patterns. */
6608 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6610 if (nested_in_vect_loop_p (loop, stmt_info))
6612 loop = loop->inner;
6613 nested_cycle = true;
6616 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6617 element. */
6618 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6620 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6621 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6623 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6624 gcc_assert (slp_node
6625 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6627 /* 1. Is vectorizable reduction? */
6628 /* Not supportable if the reduction variable is used in the loop, unless
6629 it's a reduction chain. */
6630 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6631 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6632 return false;
6634 /* Reductions that are not used even in an enclosing outer-loop
6635 are expected to be "live" (used out of the loop). */
6636 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6637 && !STMT_VINFO_LIVE_P (stmt_info))
6638 return false;
6640 /* 2. Has this been recognized as a reduction pattern?
6642 Check if STMT represents a pattern that has been recognized
6643 in earlier analysis stages. For stmts that represent a pattern,
6644 the STMT_VINFO_RELATED_STMT field records the last stmt in
6645 the original sequence that constitutes the pattern. */
6647 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6648 if (orig_stmt_info)
6650 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6651 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6654 /* 3. Check the operands of the operation. The first operands are defined
6655 inside the loop body. The last operand is the reduction variable,
6656 which is defined by the loop-header-phi. */
6658 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6659 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6660 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6661 enum tree_code code = gimple_assign_rhs_code (stmt);
6662 bool lane_reduc_code_p
6663 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6664 int op_type = TREE_CODE_LENGTH (code);
6666 scalar_dest = gimple_assign_lhs (stmt);
6667 scalar_type = TREE_TYPE (scalar_dest);
6668 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6669 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6670 return false;
6672 /* Do not try to vectorize bit-precision reductions. */
6673 if (!type_has_mode_precision_p (scalar_type))
6674 return false;
6676 /* For lane-reducing ops we're reducing the number of reduction PHIs
6677 which means the only use of that may be in the lane-reducing operation. */
6678 if (lane_reduc_code_p
6679 && reduc_chain_length != 1
6680 && !only_slp_reduc_chain)
6682 if (dump_enabled_p ())
6683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6684 "lane-reducing reduction with extra stmts.\n");
6685 return false;
6688 /* All uses but the last are expected to be defined in the loop.
6689 The last use is the reduction variable. In case of nested cycle this
6690 assumption is not true: we use reduc_index to record the index of the
6691 reduction variable. */
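     /* For example (illustrative): in s_3 = x_4 + s_2, where s_2 flows
	from the loop-header PHI, STMT_VINFO_REDUC_IDX is 1.  */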
6692 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6693 /* We need to skip an extra operand for COND_EXPRs with embedded
6694 comparison. */
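     /* For example (illustrative): in
	  r_1 = a_2 < b_3 ? x_4 : r_5;
	gimple_assign_rhs1 is the comparison a_2 < b_3 itself, so the
	value operands start one position later.  */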
6695 unsigned opno_adjust = 0;
6696 if (code == COND_EXPR
6697 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6698 opno_adjust = 1;
6699 for (i = 0; i < op_type; i++)
6701 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6702 if (i == 0 && code == COND_EXPR)
6703 continue;
6705 stmt_vec_info def_stmt_info;
6706 enum vect_def_type dt;
6707 tree op;
6708 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6709 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6710 &def_stmt_info))
6712 if (dump_enabled_p ())
6713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6714 "use not simple.\n");
6715 return false;
6717 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6718 continue;
6720 /* There should be only one cycle def in the stmt, the one
6721 leading to reduc_def. */
6722 if (VECTORIZABLE_CYCLE_DEF (dt))
6723 return false;
6725 /* To properly compute ncopies we are interested in the widest
6726 non-reduction input type in case we're looking at a widening
6727 accumulation that we later handle in vect_transform_reduction. */
6728 if (lane_reduc_code_p
6729 && tem
6730 && (!vectype_in
6731 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6732 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6733 vectype_in = tem;
6735 if (code == COND_EXPR)
6737 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6738 if (dt == vect_constant_def)
6740 cond_reduc_dt = dt;
6741 cond_reduc_val = op;
6743 if (dt == vect_induction_def
6744 && def_stmt_info
6745 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6747 cond_reduc_dt = dt;
6748 cond_stmt_vinfo = def_stmt_info;
6752 if (!vectype_in)
6753 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6754 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6756 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6757 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6758 /* If we have a condition reduction, see if we can simplify it further. */
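     /* An illustrative condition reduction (sketch):
	  for (i = 0; i < n; i++)
	    if (a[i] < k)
	      last = b[i];
	where LAST keeps the value from the last iteration whose
	condition held.  */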
6759 if (v_reduc_type == COND_REDUCTION)
6761 if (slp_node)
6762 return false;
6764 /* When the condition itself uses the reduction value, fail. */
6765 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6767 if (dump_enabled_p ())
6768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 "condition depends on previous iteration\n");
6770 return false;
6773 if (reduc_chain_length == 1
6774 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6775 vectype_in, OPTIMIZE_FOR_SPEED))
6777 if (dump_enabled_p ())
6778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6779 "optimizing condition reduction with"
6780 " FOLD_EXTRACT_LAST.\n");
6781 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6783 else if (cond_reduc_dt == vect_induction_def)
6785 tree base
6786 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6787 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6789 gcc_assert (TREE_CODE (base) == INTEGER_CST
6790 && TREE_CODE (step) == INTEGER_CST);
6791 cond_reduc_val = NULL_TREE;
6792 enum tree_code cond_reduc_op_code = ERROR_MARK;
6793 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6794 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6796 /* Find a suitable value: below base for MAX_EXPR, above base for
6797 MIN_EXPR; punt for now if base is the minimum value of the type
6798 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6799 else if (tree_int_cst_sgn (step) == -1)
6801 cond_reduc_op_code = MIN_EXPR;
6802 if (tree_int_cst_sgn (base) == -1)
6803 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6804 else if (tree_int_cst_lt (base,
6805 TYPE_MAX_VALUE (TREE_TYPE (base))))
6806 cond_reduc_val
6807 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6809 else
6811 cond_reduc_op_code = MAX_EXPR;
6812 if (tree_int_cst_sgn (base) == 1)
6813 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6814 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6815 base))
6816 cond_reduc_val
6817 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6819 if (cond_reduc_val)
6821 if (dump_enabled_p ())
6822 dump_printf_loc (MSG_NOTE, vect_location,
6823 "condition expression based on "
6824 "integer induction.\n");
6825 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6826 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6827 = cond_reduc_val;
6828 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6831 else if (cond_reduc_dt == vect_constant_def)
6833 enum vect_def_type cond_initial_dt;
6834 tree cond_initial_val
6835 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6837 gcc_assert (cond_reduc_val != NULL_TREE);
6838 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6839 if (cond_initial_dt == vect_constant_def
6840 && types_compatible_p (TREE_TYPE (cond_initial_val),
6841 TREE_TYPE (cond_reduc_val)))
6843 tree e = fold_binary (LE_EXPR, boolean_type_node,
6844 cond_initial_val, cond_reduc_val);
6845 if (e && (integer_onep (e) || integer_zerop (e)))
6847 if (dump_enabled_p ())
6848 dump_printf_loc (MSG_NOTE, vect_location,
6849 "condition expression based on "
6850 "compile time constant.\n");
6851 /* Record reduction code at analysis stage. */
6852 STMT_VINFO_REDUC_CODE (reduc_info)
6853 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6854 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6860 if (STMT_VINFO_LIVE_P (phi_info))
6861 return false;
6863 if (slp_node)
6864 ncopies = 1;
6865 else
6866 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6868 gcc_assert (ncopies >= 1);
6870 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6872 if (nested_cycle)
6874 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6875 == vect_double_reduction_def);
6876 double_reduc = true;
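     /* An illustrative double reduction (sketch): when vectorizing the
	outer loop of
	  for (i = 0; i < n; i++)
	    for (j = 0; j < m; j++)
	      sum += a[i][j];
	the inner-loop cycle for SUM feeds an enclosing cycle of the
	outer loop.  */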
6879 /* 4.2. Check support for the epilog operation.
6881 If STMT represents a reduction pattern, then the type of the
6882 reduction variable may be different than the type of the rest
6883 of the arguments. For example, consider the case of accumulation
6884 of shorts into an int accumulator. The original code:
6885 S1: int_a = (int) short_a;
6886 orig_stmt-> S2: int_acc = plus <int_a, int_acc>;
6888 was replaced with:
6889 STMT: int_acc = widen_sum <short_a, int_acc>
6891 This means that:
6892 1. The tree-code that is used to create the vector operation in the
6893 epilog code (that reduces the partial results) is not the
6894 tree-code of STMT, but is rather the tree-code of the original
6895 stmt from the pattern that STMT is replacing. I.e., in the example
6896 above we want to use 'widen_sum' in the loop, but 'plus' in the
6897 epilog.
6898 2. The type (mode) we use to check available target support
6899 for the vector operation to be created in the *epilog*, is
6900 determined by the type of the reduction variable (in the example
6901 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6902 However the type (mode) we use to check available target support
6903 for the vector operation to be created *inside the loop*, is
6904 determined by the type of the other arguments to STMT (in the
6905 example we'd check this: optab_handler (widen_sum_optab,
6906 vect_short_mode)).
6908 This is contrary to "regular" reductions, in which the types of all
6909 the arguments are the same as the type of the reduction variable.
6910 For "regular" reductions we can therefore use the same vector type
6911 (and also the same tree-code) when generating the epilog code and
6912 when generating the code inside the loop. */
6914 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6915 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6917 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6918 if (reduction_type == TREE_CODE_REDUCTION)
6920 /* Check whether it's ok to change the order of the computation.
6921 Generally, when vectorizing a reduction we change the order of the
6922 computation. This may change the behavior of the program in some
6923 cases, so we need to check that this is ok. One exception is when
6924 vectorizing an outer-loop: the inner-loop is executed sequentially,
6925 and therefore vectorizing reductions in the inner-loop during
6926 outer-loop vectorization is safe. Likewise when we are vectorizing
6927 a series of reductions using SLP and the VF is one, the reductions
6928 are performed in scalar order. */
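     /* For example (illustrative): for
	  double s = 0.0;
	  for (i = 0; i < n; i++)
	    s += a[i];
	reassociating the additions can change the rounded result, so
	without -ffast-math style flags the sum must be computed in
	order (FOLD_LEFT_REDUCTION below).  */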
6929 if (slp_node
6930 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6931 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6933 else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6935 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6936 is not directly used in stmt. */
6937 if (!only_slp_reduc_chain
6938 && reduc_chain_length != 1)
6940 if (dump_enabled_p ())
6941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6942 "in-order reduction chain without SLP.\n");
6943 return false;
6945 STMT_VINFO_REDUC_TYPE (reduc_info)
6946 = reduction_type = FOLD_LEFT_REDUCTION;
6948 else if (!commutative_tree_code (orig_code)
6949 || !associative_tree_code (orig_code))
6951 if (dump_enabled_p ())
6952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6953 "reduction: not commutative/associative.\n");
6954 return false;
6958 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6959 && ncopies > 1)
6961 if (dump_enabled_p ())
6962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6963 "multiple types in double reduction or condition "
6964 "reduction or fold-left reduction.\n");
6965 return false;
6968 internal_fn reduc_fn = IFN_LAST;
6969 if (reduction_type == TREE_CODE_REDUCTION
6970 || reduction_type == FOLD_LEFT_REDUCTION
6971 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6972 || reduction_type == CONST_COND_REDUCTION)
6974 if (reduction_type == FOLD_LEFT_REDUCTION
6975 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6976 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6978 if (reduc_fn != IFN_LAST
6979 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6980 OPTIMIZE_FOR_SPEED))
6982 if (dump_enabled_p ())
6983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6984 "reduc op not supported by target.\n");
6986 reduc_fn = IFN_LAST;
6989 else
6991 if (!nested_cycle || double_reduc)
6993 if (dump_enabled_p ())
6994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6995 "no reduc code for scalar code.\n");
6997 return false;
7001 else if (reduction_type == COND_REDUCTION)
7003 int scalar_precision
7004 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7005 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7006 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7007 vectype_out);
7009 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7010 OPTIMIZE_FOR_SPEED))
7011 reduc_fn = IFN_REDUC_MAX;
7013 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7015 if (reduction_type != EXTRACT_LAST_REDUCTION
7016 && (!nested_cycle || double_reduc)
7017 && reduc_fn == IFN_LAST
7018 && !nunits_out.is_constant ())
7020 if (dump_enabled_p ())
7021 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7022 "missing target support for reduction on"
7023 " variable-length vectors.\n");
7024 return false;
7027 /* For SLP reductions, see if there is a neutral value we can use. */
7028 tree neutral_op = NULL_TREE;
7029 if (slp_node)
7030 neutral_op = neutral_op_for_slp_reduction
7031 (slp_node_instance->reduc_phis, vectype_out, orig_code,
7032 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7034 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7036 /* We can't support in-order reductions of code such as this:
7038 for (int i = 0; i < n1; ++i)
7039 for (int j = 0; j < n2; ++j)
7040 l += a[j];
7042 since GCC effectively transforms the loop when vectorizing:
7044 for (int i = 0; i < n1 / VF; ++i)
7045 for (int j = 0; j < n2; ++j)
7046 for (int k = 0; k < VF; ++k)
7047 l += a[j];
7049 which is a reassociation of the original operation. */
7050 if (dump_enabled_p ())
7051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7052 "in-order double reduction not supported.\n");
7054 return false;
7057 if (reduction_type == FOLD_LEFT_REDUCTION
7058 && slp_node
7059 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7061 /* We cannot use in-order reductions in this case because there is
7062 an implicit reassociation of the operations involved. */
7063 if (dump_enabled_p ())
7064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7065 "in-order unchained SLP reductions not supported.\n");
7066 return false;
7069 /* For double reductions, and for SLP reductions with a neutral value,
7070 we construct a variable-length initial vector by loading a vector
7071 full of the neutral value and then shift-and-inserting the start
7072 values into the low-numbered elements. */
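     /* For example (sketch): for an add reduction with start value S and
	neutral value 0 on a variable-length vector we load { 0, ..., 0 }
	and use IFN_VEC_SHL_INSERT to place S in a low-numbered lane,
	giving { S, 0, ..., 0 } as the initial value of the PHI.  */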
7073 if ((double_reduc || neutral_op)
7074 && !nunits_out.is_constant ()
7075 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7076 vectype_out, OPTIMIZE_FOR_SPEED))
7078 if (dump_enabled_p ())
7079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7080 "reduction on variable-length vectors requires"
7081 " target support for a vector-shift-and-insert"
7082 " operation.\n");
7083 return false;
7086 /* Check extra constraints for variable-length unchained SLP reductions. */
7087 if (STMT_SLP_TYPE (stmt_info)
7088 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7089 && !nunits_out.is_constant ())
7091 /* We checked above that we could build the initial vector when
7092 there's a neutral element value. Check here for the case in
7093 which each SLP statement has its own initial value and in which
7094 that value needs to be repeated for every instance of the
7095 statement within the initial vector. */
7096 unsigned int group_size = SLP_TREE_LANES (slp_node);
7097 if (!neutral_op
7098 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7099 TREE_TYPE (vectype_out)))
7101 if (dump_enabled_p ())
7102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7103 "unsupported form of SLP reduction for"
7104 " variable-length vectors: cannot build"
7105 " initial vector.\n");
7106 return false;
7108 /* The epilogue code relies on the number of elements being a multiple
7109 of the group size. The duplicate-and-interleave approach to setting
7110 up the initial vector does too. */
7111 if (!multiple_p (nunits_out, group_size))
7113 if (dump_enabled_p ())
7114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7115 "unsupported form of SLP reduction for"
7116 " variable-length vectors: the vector size"
7117 " is not a multiple of the number of results.\n");
7118 return false;
7122 if (reduction_type == COND_REDUCTION)
7124 widest_int ni;
7126 if (! max_loop_iterations (loop, &ni))
7128 if (dump_enabled_p ())
7129 dump_printf_loc (MSG_NOTE, vect_location,
7130 "loop count not known, cannot create cond "
7131 "reduction.\n");
7132 return false;
7134 /* Convert backedges to iterations. */
7135 ni += 1;
7137 /* The additional index will have the same type as the condition. Check
7138 that the loop iteration count fits into this type less one (because
7139 the zero slot is reserved for when there are no matches). */
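     /* Illustrative sketch: with a VF of 4 the extra index vector counts
	{ 1, 2, 3, 4 }, { 5, 6, 7, 8 }, ...; index 0 is reserved to mean
	"no iteration matched", which is why the iteration count must fit
	into the index type less one.  */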
7140 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7141 if (wi::geu_p (ni, wi::to_widest (max_index)))
7143 if (dump_enabled_p ())
7144 dump_printf_loc (MSG_NOTE, vect_location,
7145 "loop size is greater than data size.\n");
7146 return false;
7150 /* In case the vectorization factor (VF) is bigger than the number
7151 of elements that we can fit in a vectype (nunits), we have to generate
7152 more than one vector stmt - i.e. we need to "unroll" the
7153 vector stmt by a factor VF/nunits. For more details see documentation
7154 in vectorizable_operation. */
7156 /* If the reduction is used in an outer loop we need to generate
7157 VF intermediate results, like so (e.g. for ncopies=2):
7158 r0 = phi (init, r0)
7159 r1 = phi (init, r1)
7160 r0 = x0 + r0;
7161 r1 = x1 + r1;
7162 (i.e. we generate VF results in 2 registers).
7163 In this case we have a separate def-use cycle for each copy, and therefore
7164 for each copy we get the vector def for the reduction variable from the
7165 respective phi node created for this copy.
7167 Otherwise (the reduction is unused in the loop nest), we can combine
7168 together intermediate results, like so (e.g. for ncopies=2):
7169 r = phi (init, r)
7170 r = x0 + r;
7171 r = x1 + r;
7172 (i.e. we generate VF/2 results in a single register).
7173 In this case for each copy we get the vector def for the reduction variable
7174 from the vectorized reduction operation generated in the previous iteration.
7176 This only works when we see both the reduction PHI and its only consumer
7177 in vectorizable_reduction and there are no intermediate stmts
7178 participating. */
7179 if (ncopies > 1
7180 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7181 && reduc_chain_length == 1)
7182 single_defuse_cycle = true;
7184 if (single_defuse_cycle || lane_reduc_code_p)
7186 gcc_assert (code != COND_EXPR);
7188 /* 4. Supportable by target? */
7189 bool ok = true;
7191 /* 4.1. check support for the operation in the loop */
7192 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
7193 if (!optab)
7195 if (dump_enabled_p ())
7196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7197 "no optab.\n");
7198 ok = false;
7201 machine_mode vec_mode = TYPE_MODE (vectype_in);
7202 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7204 if (dump_enabled_p ())
7205 dump_printf (MSG_NOTE, "op not supported by target.\n");
7206 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7207 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7208 ok = false;
7209 else
7210 if (dump_enabled_p ())
7211 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7214 /* Worthwhile without SIMD support? */
7215 if (ok
7216 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7217 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7219 if (dump_enabled_p ())
7220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7221 "not worthwhile without SIMD support.\n");
7222 ok = false;
7225 /* lane-reducing operations have to go through vect_transform_reduction.
7226 For the other cases try without the single cycle optimization. */
7227 if (!ok)
7229 if (lane_reduc_code_p)
7230 return false;
7231 else
7232 single_defuse_cycle = false;
7235 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7237 /* If the reduction stmt is one of the patterns that have lane
7238 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
7239 if ((ncopies > 1 && ! single_defuse_cycle)
7240 && lane_reduc_code_p)
7242 if (dump_enabled_p ())
7243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7244 "multi def-use cycle not possible for lane-reducing "
7245 "reduction operation\n");
7246 return false;
7249 if (slp_node
7250 && !(!single_defuse_cycle
7251 && code != DOT_PROD_EXPR
7252 && code != WIDEN_SUM_EXPR
7253 && code != SAD_EXPR
7254 && reduction_type != FOLD_LEFT_REDUCTION))
7255 for (i = 0; i < op_type; i++)
7256 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7258 if (dump_enabled_p ())
7259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7260 "incompatible vector types for invariants\n");
7261 return false;
7264 if (slp_node)
7265 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7266 else
7267 vec_num = 1;
7269 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7270 reduction_type, ncopies, cost_vec);
7271 /* Cost the reduction op inside the loop if transformed via
7272 vect_transform_reduction. Otherwise this is costed by the
7273 separate vectorizable_* routines. */
7274 if (single_defuse_cycle
7275 || code == DOT_PROD_EXPR
7276 || code == WIDEN_SUM_EXPR
7277 || code == SAD_EXPR)
7278 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7280 if (dump_enabled_p ()
7281 && reduction_type == FOLD_LEFT_REDUCTION)
7282 dump_printf_loc (MSG_NOTE, vect_location,
7283 "using an in-order (fold-left) reduction.\n");
7284 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7285 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7286 reductions go through their own vectorizable_* routines. */
7287 if (!single_defuse_cycle
7288 && code != DOT_PROD_EXPR
7289 && code != WIDEN_SUM_EXPR
7290 && code != SAD_EXPR
7291 && reduction_type != FOLD_LEFT_REDUCTION)
7293 stmt_vec_info tem
7294 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7295 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7297 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7298 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7300 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7301 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7303 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7305 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7306 internal_fn cond_fn = get_conditional_internal_fn (code);
7308 if (reduction_type != FOLD_LEFT_REDUCTION
7309 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7310 && (cond_fn == IFN_LAST
7311 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7312 OPTIMIZE_FOR_SPEED)))
7314 if (dump_enabled_p ())
7315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7316 "can't operate on partial vectors because"
7317 " no conditional operation is available.\n");
7318 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7320 else if (reduction_type == FOLD_LEFT_REDUCTION
7321 && reduc_fn == IFN_LAST
7322 && !expand_vec_cond_expr_p (vectype_in,
7323 truth_type_for (vectype_in),
7324 SSA_NAME))
7326 if (dump_enabled_p ())
7327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7328 "can't operate on partial vectors because"
7329 " no conditional operation is available.\n");
7330 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7332 else
7333 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7334 vectype_in, NULL);
7336 return true;
7339 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7340 value. */
7342 bool
7343 vect_transform_reduction (loop_vec_info loop_vinfo,
7344 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7345 gimple **vec_stmt, slp_tree slp_node)
7347 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7348 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7349 int i;
7350 int ncopies;
7351 int vec_num;
7353 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7354 gcc_assert (reduc_info->is_reduc_info);
7356 if (nested_in_vect_loop_p (loop, stmt_info))
7358 loop = loop->inner;
7359 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7362 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7363 enum tree_code code = gimple_assign_rhs_code (stmt);
7364 int op_type = TREE_CODE_LENGTH (code);
7366 /* Flatten RHS. */
7367 tree ops[3];
7368 switch (get_gimple_rhs_class (code))
7370 case GIMPLE_TERNARY_RHS:
7371 ops[2] = gimple_assign_rhs3 (stmt);
7372 /* Fall thru. */
7373 case GIMPLE_BINARY_RHS:
7374 ops[0] = gimple_assign_rhs1 (stmt);
7375 ops[1] = gimple_assign_rhs2 (stmt);
7376 break;
7377 default:
7378 gcc_unreachable ();
7381 /* All uses but the last are expected to be defined in the loop.
7382 The last use is the reduction variable. In case of nested cycle this
7383 assumption is not true: we use reduc_index to record the index of the
7384 reduction variable. */
7385 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7386 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7387 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7388 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7390 if (slp_node)
7392 ncopies = 1;
7393 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7395 else
7397 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7398 vec_num = 1;
7401 internal_fn cond_fn = get_conditional_internal_fn (code);
7402 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7403 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
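  /* Two masking strategies are used below (an illustrative summary):
     either the statement is emitted as a conditional internal function,
     e.g. new = IFN_COND_ADD (loop_mask, accum, x, accum), or, when
     use_mask_by_cond_expr_p holds, an input operand is first blended
     with a neutral value through a VEC_COND_EXPR and the plain
     operation is kept.  */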
7405 /* Transform. */
7406 tree new_temp = NULL_TREE;
7407 auto_vec<tree> vec_oprnds0;
7408 auto_vec<tree> vec_oprnds1;
7409 auto_vec<tree> vec_oprnds2;
7410 tree def0;
7412 if (dump_enabled_p ())
7413 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7415 /* FORNOW: Multiple types are not supported for condition. */
7416 if (code == COND_EXPR)
7417 gcc_assert (ncopies == 1);
7419 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7421 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7422 if (reduction_type == FOLD_LEFT_REDUCTION)
7424 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7425 return vectorize_fold_left_reduction
7426 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7427 reduc_fn, ops, vectype_in, reduc_index, masks);
7430 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7431 gcc_assert (single_defuse_cycle
7432 || code == DOT_PROD_EXPR
7433 || code == WIDEN_SUM_EXPR
7434 || code == SAD_EXPR);
7436 /* Create the destination vector */
7437 tree scalar_dest = gimple_assign_lhs (stmt);
7438 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7440 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7441 single_defuse_cycle && reduc_index == 0
7442 ? NULL_TREE : ops[0], &vec_oprnds0,
7443 single_defuse_cycle && reduc_index == 1
7444 ? NULL_TREE : ops[1], &vec_oprnds1,
7445 op_type == ternary_op
7446 && !(single_defuse_cycle && reduc_index == 2)
7447 ? ops[2] : NULL_TREE, &vec_oprnds2);
7448 if (single_defuse_cycle)
7450 gcc_assert (!slp_node);
7451 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7452 ops[reduc_index],
7453 reduc_index == 0 ? &vec_oprnds0
7454 : (reduc_index == 1 ? &vec_oprnds1
7455 : &vec_oprnds2));
7458 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7460 gimple *new_stmt;
7461 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7462 if (masked_loop_p && !mask_by_cond_expr)
7464 /* Make sure that the reduction accumulator is vop[0]. */
7465 if (reduc_index == 1)
7467 gcc_assert (commutative_tree_code (code));
7468 std::swap (vop[0], vop[1]);
7470 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7471 vectype_in, i);
7472 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7473 vop[0], vop[1], vop[0]);
7474 new_temp = make_ssa_name (vec_dest, call);
7475 gimple_call_set_lhs (call, new_temp);
7476 gimple_call_set_nothrow (call, true);
7477 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7478 new_stmt = call;
7480 else
7482 if (op_type == ternary_op)
7483 vop[2] = vec_oprnds2[i];
7485 if (masked_loop_p && mask_by_cond_expr)
7487 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7488 vectype_in, i);
7489 build_vect_cond_expr (code, vop, mask, gsi);
7492 new_stmt = gimple_build_assign (vec_dest, code,
7493 vop[0], vop[1], vop[2]);
7494 new_temp = make_ssa_name (vec_dest, new_stmt);
7495 gimple_assign_set_lhs (new_stmt, new_temp);
7496 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7499 if (slp_node)
7500 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7501 else if (single_defuse_cycle
7502 && i < ncopies - 1)
7504 if (reduc_index == 0)
7505 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7506 else if (reduc_index == 1)
7507 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7508 else if (reduc_index == 2)
7509 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7511 else
7512 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7515 if (!slp_node)
7516 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7518 return true;
7521 /* Transform phase of a cycle PHI. */
7523 bool
7524 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7525 stmt_vec_info stmt_info, gimple **vec_stmt,
7526 slp_tree slp_node, slp_instance slp_node_instance)
7528 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7529 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7530 int i;
7531 int ncopies;
7532 int j;
7533 bool nested_cycle = false;
7534 int vec_num;
7536 if (nested_in_vect_loop_p (loop, stmt_info))
7538 loop = loop->inner;
7539 nested_cycle = true;
7542 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7543 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7544 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7545 gcc_assert (reduc_info->is_reduc_info);
7547 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7548 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7549 /* Leave the scalar phi in place. */
7550 return true;
7552 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7553 /* For a nested cycle we do not fill the above. */
7554 if (!vectype_in)
7555 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7556 gcc_assert (vectype_in);
7558 if (slp_node)
7560 /* The size vect_schedule_slp_instance computes is off for us. */
7561 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7562 * SLP_TREE_LANES (slp_node), vectype_in);
7563 ncopies = 1;
7565 else
7567 vec_num = 1;
7568 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7571 /* Check whether we should use a single PHI node and accumulate
7572 vectors to one before the backedge. */
7573 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7574 ncopies = 1;
7576 /* Create the destination vector */
7577 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7578 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7579 vectype_out);
7581 /* Get the loop-entry arguments. */
7582 tree vec_initial_def;
7583 auto_vec<tree> vec_initial_defs;
7584 if (slp_node)
7586 vec_initial_defs.reserve (vec_num);
7587 if (nested_cycle)
7589 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7590 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7591 &vec_initial_defs);
7593 else
7595 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7596 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7597 tree neutral_op
7598 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7599 STMT_VINFO_REDUC_CODE (reduc_info),
7600 first != NULL);
7601 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7602 &vec_initial_defs, vec_num,
7603 first != NULL, neutral_op);
7606 else
7608 /* Get at the scalar def before the loop, that defines the initial
7609 value of the reduction variable. */
7610 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7611 loop_preheader_edge (loop));
7612 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7613 and we can't use zero for induc_val, use initial_def. Similarly
7614 for REDUC_MIN and initial_def larger than the base. */
7615 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7617 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7618 if (TREE_CODE (initial_def) == INTEGER_CST
7619 && !integer_zerop (induc_val)
7620 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7621 && tree_int_cst_lt (initial_def, induc_val))
7622 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7623 && tree_int_cst_lt (induc_val, initial_def))))
7625 induc_val = initial_def;
7626 /* Communicate that we used the initial_def to epilogue
7627 generation. */
7628 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7630 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7631 vec_initial_defs.create (ncopies);
7632 for (i = 0; i < ncopies; ++i)
7633 vec_initial_defs.quick_push (vec_initial_def);
7635 else if (nested_cycle)
7637 /* Do not use an adjustment def as that case is not supported
7638 correctly if ncopies is not one. */
7639 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7640 ncopies, initial_def,
7641 &vec_initial_defs);
7643 else
7645 tree adjustment_def = NULL_TREE;
7646 tree *adjustment_defp = &adjustment_def;
7647 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7648 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7649 adjustment_defp = NULL;
7650 vec_initial_def
7651 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7652 initial_def, adjustment_defp);
7653 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7654 vec_initial_defs.create (ncopies);
7655 for (i = 0; i < ncopies; ++i)
7656 vec_initial_defs.quick_push (vec_initial_def);
7660 /* Generate the reduction PHIs upfront. */
7661 for (i = 0; i < vec_num; i++)
7663 tree vec_init_def = vec_initial_defs[i];
7664 for (j = 0; j < ncopies; j++)
7666 /* Create the reduction-phi that defines the reduction
7667 operand. */
7668 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7670 /* Set the loop-entry arg of the reduction-phi. */
7671 if (j != 0 && nested_cycle)
7672 vec_init_def = vec_initial_defs[j];
7673 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7674 UNKNOWN_LOCATION);
7676 /* The loop-latch arg is set in epilogue processing. */
7678 if (slp_node)
7679 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7680 else
7682 if (j == 0)
7683 *vec_stmt = new_phi;
7684 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7689 return true;
7692 /* Vectorizes LC PHIs. */
7694 bool
7695 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7696 stmt_vec_info stmt_info, gimple **vec_stmt,
7697 slp_tree slp_node)
7699 if (!loop_vinfo
7700 || !is_a <gphi *> (stmt_info->stmt)
7701 || gimple_phi_num_args (stmt_info->stmt) != 1)
7702 return false;
7704 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7705 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7706 return false;
7708 if (!vec_stmt) /* transformation not required. */
7710 /* Deal with copies from externs or constants that are disguised as
7711 loop-closed PHI nodes (PR97886). */
7712 if (slp_node
7713 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7714 SLP_TREE_VECTYPE (slp_node)))
7716 if (dump_enabled_p ())
7717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7718 "incompatible vector types for invariants\n");
7719 return false;
7721 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7722 return true;
7725 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7726 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7727 basic_block bb = gimple_bb (stmt_info->stmt);
7728 edge e = single_pred_edge (bb);
7729 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7730 auto_vec<tree> vec_oprnds;
7731 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7732 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7733 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7734 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7736 /* Create the vectorized LC PHI node. */
7737 gphi *new_phi = create_phi_node (vec_dest, bb);
7738 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7739 if (slp_node)
7740 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7741 else
7742 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7744 if (!slp_node)
7745 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7747 return true;
7750 /* Vectorizes PHIs. */
7752 bool
7753 vectorizable_phi (vec_info *,
7754 stmt_vec_info stmt_info, gimple **vec_stmt,
7755 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7757 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7758 return false;
7760 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7761 return false;
7763 tree vectype = SLP_TREE_VECTYPE (slp_node);
7765 if (!vec_stmt) /* transformation not required. */
7767 slp_tree child;
7768 unsigned i;
7769 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7770 if (!child)
7772 if (dump_enabled_p ())
7773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7774 "PHI node with unvectorized backedge def\n");
7775 return false;
7777 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7779 if (dump_enabled_p ())
7780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7781 "incompatible vector types for invariants\n");
7782 return false;
7784 /* For single-argument PHIs assume coalescing which means zero cost
7785 for the scalar and the vector PHIs. This avoids artificially
7786 favoring the vector path (but may pessimize it in some cases). */
7787 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7788 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7789 vector_stmt, stmt_info, vectype, 0, vect_body);
7790 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7791 return true;
7794 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7795 basic_block bb = gimple_bb (stmt_info->stmt);
7796 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7797 auto_vec<gphi *> new_phis;
7798 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7800 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7802 /* Skip not yet vectorized defs. */
7803 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7804 && SLP_TREE_VEC_STMTS (child).is_empty ())
7805 continue;
7807 auto_vec<tree> vec_oprnds;
7808 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7809 if (!new_phis.exists ())
7811 new_phis.create (vec_oprnds.length ());
7812 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7814 /* Create the vectorized LC PHI node. */
7815 new_phis.quick_push (create_phi_node (vec_dest, bb));
7816 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7819 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7820 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7821 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7823 /* We should have at least one already vectorized child. */
7824 gcc_assert (new_phis.exists ());
7826 return true;
7830 /* Function vect_min_worthwhile_factor.
7832 For a loop where we could vectorize the operation indicated by CODE,
7833 return the minimum vectorization factor that makes it worthwhile
7834 to use generic vectors. */
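/* One plausible illustration (not a statement of the original rationale):
   two 32-bit lanes packed in a 64-bit word can be ANDed with a single
   scalar AND, so bitwise codes pay off already at a factor of 2, whereas
   emulated additions need extra code to keep carries from crossing lane
   boundaries and are only assumed worthwhile from a factor of 4.  */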
7835 static unsigned int
7836 vect_min_worthwhile_factor (enum tree_code code)
7838 switch (code)
7840 case PLUS_EXPR:
7841 case MINUS_EXPR:
7842 case NEGATE_EXPR:
7843 return 4;
7845 case BIT_AND_EXPR:
7846 case BIT_IOR_EXPR:
7847 case BIT_XOR_EXPR:
7848 case BIT_NOT_EXPR:
7849 return 2;
7851 default:
7852 return INT_MAX;
7856 /* Return true if VINFO indicates we are doing loop vectorization and if
7857 it is worth decomposing CODE operations into scalar operations for
7858 that loop's vectorization factor. */
7860 bool
7861 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7863 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7864 unsigned HOST_WIDE_INT value;
7865 return (loop_vinfo
7866 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7867 && value >= vect_min_worthwhile_factor (code));
7870 /* Function vectorizable_induction
7872 Check if STMT_INFO performs an induction computation that can be vectorized.
7873 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7874 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7875 Return true if STMT_INFO is vectorizable in this way. */
7877 bool
7878 vectorizable_induction (loop_vec_info loop_vinfo,
7879 stmt_vec_info stmt_info,
7880 gimple **vec_stmt, slp_tree slp_node,
7881 stmt_vector_for_cost *cost_vec)
7883 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7884 unsigned ncopies;
7885 bool nested_in_vect_loop = false;
7886 class loop *iv_loop;
7887 tree vec_def;
7888 edge pe = loop_preheader_edge (loop);
7889 basic_block new_bb;
7890 tree new_vec, vec_init, vec_step, t;
7891 tree new_name;
7892 gimple *new_stmt;
7893 gphi *induction_phi;
7894 tree induc_def, vec_dest;
7895 tree init_expr, step_expr;
7896 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7897 unsigned i;
7898 tree expr;
7899 gimple_stmt_iterator si;
7901 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7902 if (!phi)
7903 return false;
7905 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7906 return false;
7908 /* Make sure it was recognized as induction computation. */
7909 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7910 return false;
7912 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7913 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7915 if (slp_node)
7916 ncopies = 1;
7917 else
7918 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7919 gcc_assert (ncopies >= 1);
7921 /* FORNOW. These restrictions should be relaxed. */
7922 if (nested_in_vect_loop_p (loop, stmt_info))
7924 imm_use_iterator imm_iter;
7925 use_operand_p use_p;
7926 gimple *exit_phi;
7927 edge latch_e;
7928 tree loop_arg;
7930 if (ncopies > 1)
7932 if (dump_enabled_p ())
7933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7934 "multiple types in nested loop.\n");
7935 return false;
7938 exit_phi = NULL;
7939 latch_e = loop_latch_edge (loop->inner);
7940 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7941 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7943 gimple *use_stmt = USE_STMT (use_p);
7944 if (is_gimple_debug (use_stmt))
7945 continue;
7947 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7949 exit_phi = use_stmt;
7950 break;
7953 if (exit_phi)
7955 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7956 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7957 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7959 if (dump_enabled_p ())
7960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7961 "inner-loop induction only used outside "
7962 "of the outer vectorized loop.\n");
7963 return false;
7967 nested_in_vect_loop = true;
7968 iv_loop = loop->inner;
7970 else
7971 iv_loop = loop;
7972 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7974 if (slp_node && !nunits.is_constant ())
7976 /* The current SLP code creates the step value element-by-element. */
7977 if (dump_enabled_p ())
7978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7979 "SLP induction not supported for variable-length"
7980 " vectors.\n");
7981 return false;
7984 if (!vec_stmt) /* transformation not required. */
7986 unsigned inside_cost = 0, prologue_cost = 0;
7987 if (slp_node)
7989 /* We eventually need to set a vector type on invariant
7990 arguments. */
7991 unsigned j;
7992 slp_tree child;
7993 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7994 if (!vect_maybe_update_slp_op_vectype
7995 (child, SLP_TREE_VECTYPE (slp_node)))
7997 if (dump_enabled_p ())
7998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7999 "incompatible vector types for "
8000 "invariants\n");
8001 return false;
8003 /* loop cost for vec_loop. */
8004 inside_cost
8005 = record_stmt_cost (cost_vec,
8006 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8007 vector_stmt, stmt_info, 0, vect_body);
8008 /* prologue cost for vec_init (if not nested) and step. */
8009 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8010 scalar_to_vec,
8011 stmt_info, 0, vect_prologue);
8013 else /* if (!slp_node) */
8015 /* loop cost for vec_loop. */
8016 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8017 stmt_info, 0, vect_body);
8018 /* prologue cost for vec_init and vec_step. */
8019 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8020 stmt_info, 0, vect_prologue);
8022 if (dump_enabled_p ())
8023 dump_printf_loc (MSG_NOTE, vect_location,
8024 "vect_model_induction_cost: inside_cost = %d, "
8025 "prologue_cost = %d .\n", inside_cost,
8026 prologue_cost);
8028 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8029 DUMP_VECT_SCOPE ("vectorizable_induction");
8030 return true;
8033 /* Transform. */
8035 /* Compute a vector variable, initialized with the first VF values of
8036 the induction variable. E.g., for an iv with IV_PHI='X' and
8037 evolution S, for a vector of 4 units, we want to compute:
8038 [X, X + S, X + 2*S, X + 3*S]. */
8040 if (dump_enabled_p ())
8041 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8043 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8044 gcc_assert (step_expr != NULL_TREE);
8045 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8047 pe = loop_preheader_edge (iv_loop);
8048 /* Find the first insertion point in the BB. */
8049 basic_block bb = gimple_bb (phi);
8050 si = gsi_after_labels (bb);
8052 /* For SLP induction we have to generate several IVs as for example
8053 with group size 3 we need
8054 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8055 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8056 if (slp_node)
8058 /* Enforced above. */
8059 unsigned int const_nunits = nunits.to_constant ();
8061 /* The initial values are vectorized, but any lanes > group_size
8062 need adjustment. */
8063 slp_tree init_node
8064 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8066 /* Gather steps. Since we do not vectorize inductions as
8067 cycles we have to reconstruct the step from SCEV data. */
8068 unsigned group_size = SLP_TREE_LANES (slp_node);
8069 tree *steps = XALLOCAVEC (tree, group_size);
8070 tree *inits = XALLOCAVEC (tree, group_size);
8071 stmt_vec_info phi_info;
8072 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8074 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8075 if (!init_node)
8076 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8077 pe->dest_idx);
8080 /* Now generate the IVs. */
8081 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8082 gcc_assert ((const_nunits * nvects) % group_size == 0);
8083 unsigned nivs;
8084 if (nested_in_vect_loop)
8085 nivs = nvects;
8086 else
8088 /* Compute the number of distinct IVs we need. First reduce
8089 group_size if it is a multiple of const_nunits so we get
8090 one IV for a group_size of 4 but const_nunits 2. */
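      /* For the group-size-3 example above with const_nunits 4 this
	 gives nivs = lcm (3, 4) / 4 = 3; for group_size 4 and
	 const_nunits 2, group_sizep becomes 2 and
	 nivs = lcm (2, 2) / 2 = 1.  */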
8091 unsigned group_sizep = group_size;
8092 if (group_sizep % const_nunits == 0)
8093 group_sizep = group_sizep / const_nunits;
8094 nivs = least_common_multiple (group_sizep,
8095 const_nunits) / const_nunits;
8097 tree stept = TREE_TYPE (step_vectype);
8098 tree lupdate_mul = NULL_TREE;
8099 if (!nested_in_vect_loop)
8101 /* The number of iterations covered in one vector iteration. */
8102 unsigned lup_mul = (nvects * const_nunits) / group_size;
8103 lupdate_mul
8104 = build_vector_from_val (step_vectype,
8105 SCALAR_FLOAT_TYPE_P (stept)
8106 ? build_real_from_wide (stept, lup_mul,
8107 UNSIGNED)
8108 : build_int_cstu (stept, lup_mul));
8110 tree peel_mul = NULL_TREE;
8111 gimple_seq init_stmts = NULL;
8112 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8114 if (SCALAR_FLOAT_TYPE_P (stept))
8115 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8116 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8117 else
8118 peel_mul = gimple_convert (&init_stmts, stept,
8119 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8120 peel_mul = gimple_build_vector_from_val (&init_stmts,
8121 step_vectype, peel_mul);
8123 unsigned ivn;
8124 auto_vec<tree> vec_steps;
8125 for (ivn = 0; ivn < nivs; ++ivn)
8127 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8128 tree_vector_builder init_elts (vectype, const_nunits, 1);
8129 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8130 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8132 /* The scalar steps of the IVs. */
8133 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8134 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8135 step_elts.quick_push (elt);
8136 if (!init_node)
8138 /* The scalar inits of the IVs if not vectorized. */
8139 elt = inits[(ivn*const_nunits + eltn) % group_size];
8140 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8141 TREE_TYPE (elt)))
8142 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8143 TREE_TYPE (vectype), elt);
8144 init_elts.quick_push (elt);
8146 /* The number of steps to add to the initial values. */
8147 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8148 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8149 ? build_real_from_wide (stept,
8150 mul_elt, UNSIGNED)
8151 : build_int_cstu (stept, mul_elt));
8153 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8154 vec_steps.safe_push (vec_step);
8155 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8156 if (peel_mul)
8157 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8158 step_mul, peel_mul);
8159 if (!init_node)
8160 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8162 /* Create the induction-phi that defines the induction-operand. */
8163 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8164 "vec_iv_");
8165 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8166 induc_def = PHI_RESULT (induction_phi);
8168 /* Create the iv update inside the loop */
8169 tree up = vec_step;
8170 if (lupdate_mul)
8171 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8172 vec_step, lupdate_mul);
8173 gimple_seq stmts = NULL;
8174 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8175 vec_def = gimple_build (&stmts,
8176 PLUS_EXPR, step_vectype, vec_def, up);
8177 vec_def = gimple_convert (&stmts, vectype, vec_def);
8178 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8179 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8180 UNKNOWN_LOCATION);
8182 if (init_node)
8183 vec_init = vect_get_slp_vect_def (init_node, ivn);
8184 if (!nested_in_vect_loop
8185 && !integer_zerop (step_mul))
8187 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8188 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8189 vec_step, step_mul);
8190 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8191 vec_def, up);
8192 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8195 /* Set the arguments of the phi node: */
8196 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8198 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8200 if (!nested_in_vect_loop)
8202 /* Fill up to the number of vectors we need for the whole group. */
8203 nivs = least_common_multiple (group_size,
8204 const_nunits) / const_nunits;
8205 for (; ivn < nivs; ++ivn)
8207 SLP_TREE_VEC_STMTS (slp_node)
8208 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8209 vec_steps.safe_push (vec_steps[0]);
8213 /* Re-use IVs when we can. We are generating further vector
8214 stmts by adding VF' * stride to the IVs generated above. */
8215 if (ivn < nvects)
8217 unsigned vfp
8218 = least_common_multiple (group_size, const_nunits) / group_size;
8219 tree lupdate_mul
8220 = build_vector_from_val (step_vectype,
8221 SCALAR_FLOAT_TYPE_P (stept)
8222 ? build_real_from_wide (stept,
8223 vfp, UNSIGNED)
8224 : build_int_cstu (stept, vfp));
8225 for (; ivn < nvects; ++ivn)
8227 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8228 tree def = gimple_get_lhs (iv);
8229 if (ivn < 2*nivs)
8230 vec_steps[ivn - nivs]
8231 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8232 vec_steps[ivn - nivs], lupdate_mul);
8233 gimple_seq stmts = NULL;
8234 def = gimple_convert (&stmts, step_vectype, def);
8235 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8236 def, vec_steps[ivn % nivs]);
8237 def = gimple_convert (&stmts, vectype, def);
8238 if (gimple_code (iv) == GIMPLE_PHI)
8239 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8240 else
8242 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8243 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8245 SLP_TREE_VEC_STMTS (slp_node)
8246 .quick_push (SSA_NAME_DEF_STMT (def));
8250 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8251 gcc_assert (!new_bb);
8253 return true;
8256 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
8257 loop_preheader_edge (iv_loop));
8259 gimple_seq stmts = NULL;
8260 if (!nested_in_vect_loop)
8262 /* Convert the initial value to the IV update type. */
8263 tree new_type = TREE_TYPE (step_expr);
8264 init_expr = gimple_convert (&stmts, new_type, init_expr);
8266 /* If we are using the loop mask to "peel" for alignment then we need
8267 to adjust the start value here. */
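      /* Illustrative sketch: if the masked first iteration skips
	 skip_niters == 3 scalar iterations, the start value becomes
	 init - 3 * step, so that lane 3, the first active lane, still
	 sees the original initial value once the per-lane step offsets
	 are added.  */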
8268 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8269 if (skip_niters != NULL_TREE)
8271 if (FLOAT_TYPE_P (vectype))
8272 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8273 skip_niters);
8274 else
8275 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8276 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8277 skip_niters, step_expr);
8278 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8279 init_expr, skip_step);
8283 if (stmts)
8285 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8286 gcc_assert (!new_bb);
8289 /* Create the vector that holds the initial_value of the induction. */
8290 if (nested_in_vect_loop)
8292 /* iv_loop is nested in the loop to be vectorized. init_expr has already
8293 been created during vectorization of previous stmts. We obtain it
8294 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8295 auto_vec<tree> vec_inits;
8296 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8297 init_expr, &vec_inits);
8298 vec_init = vec_inits[0];
8299 /* If the initial value is not of proper type, convert it. */
8300 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8302 new_stmt
8303 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8304 vect_simple_var,
8305 "vec_iv_"),
8306 VIEW_CONVERT_EXPR,
8307 build1 (VIEW_CONVERT_EXPR, vectype,
8308 vec_init));
8309 vec_init = gimple_assign_lhs (new_stmt);
8310 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8311 new_stmt);
8312 gcc_assert (!new_bb);
8315 else
8317 /* iv_loop is the loop to be vectorized. Create:
8318 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8319 stmts = NULL;
8320 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8322 unsigned HOST_WIDE_INT const_nunits;
8323 if (nunits.is_constant (&const_nunits))
8325 tree_vector_builder elts (step_vectype, const_nunits, 1);
8326 elts.quick_push (new_name);
8327 for (i = 1; i < const_nunits; i++)
8329 /* Create: new_name_i = new_name + step_expr */
8330 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8331 new_name, step_expr);
8332 elts.quick_push (new_name);
8334 /* Create a vector from [new_name_0, new_name_1, ...,
8335 new_name_nunits-1] */
8336 vec_init = gimple_build_vector (&stmts, &elts);
8338 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8339 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8340 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8341 new_name, step_expr);
8342 else
8344 /* Build:
8345 [base, base, base, ...]
8346 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8347 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8348 gcc_assert (flag_associative_math);
8349 tree index = build_index_vector (step_vectype, 0, 1);
8350 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8351 new_name);
8352 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8353 step_expr);
8354 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8355 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8356 vec_init, step_vec);
8357 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8358 vec_init, base_vec);
8360 vec_init = gimple_convert (&stmts, vectype, vec_init);
8362 if (stmts)
8364 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8365 gcc_assert (!new_bb);
8370 /* Create the vector that holds the step of the induction. */
8371 if (nested_in_vect_loop)
8372 /* iv_loop is nested in the loop to be vectorized. Generate:
8373 vec_step = [S, S, S, S] */
8374 new_name = step_expr;
8375 else
8377 /* iv_loop is the loop to be vectorized. Generate:
8378 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8379 gimple_seq seq = NULL;
8380 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8382 expr = build_int_cst (integer_type_node, vf);
8383 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8385 else
8386 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8387 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8388 expr, step_expr);
8389 if (seq)
8391 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8392 gcc_assert (!new_bb);
8396 t = unshare_expr (new_name);
8397 gcc_assert (CONSTANT_CLASS_P (new_name)
8398 || TREE_CODE (new_name) == SSA_NAME);
8399 new_vec = build_vector_from_val (step_vectype, t);
8400 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8401 new_vec, step_vectype, NULL);
8404 /* Create the following def-use cycle:
8405 loop prolog:
8406 vec_init = ...
8407 vec_step = ...
8408 loop:
8409 vec_iv = PHI <vec_init, vec_loop>
8411 STMT
8413 vec_loop = vec_iv + vec_step; */
8415 /* Create the induction-phi that defines the induction-operand. */
8416 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8417 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8418 induc_def = PHI_RESULT (induction_phi);
8420 /* Create the iv update inside the loop */
8421 stmts = NULL;
8422 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8423 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8424 vec_def = gimple_convert (&stmts, vectype, vec_def);
8425 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8426 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8428 /* Set the arguments of the phi node: */
8429 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8430 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8431 UNKNOWN_LOCATION);
8433 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8434 *vec_stmt = induction_phi;
8436 /* In case the vectorization factor (VF) is bigger than the number
8437 of elements that we can fit in a vectype (nunits), we have to generate
8438 more than one vector stmt, i.e. we need to "unroll" the
8439 vector stmt by a factor of VF/nunits. For more details see the
8440 documentation in vectorizable_operation. */
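/* For example, a VF of 8 with 4-element vectors needs ncopies = 2; the
   second copy is offset from the first by nunits * S, which is exactly
   the step that is built below. */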
8442 if (ncopies > 1)
8444 gimple_seq seq = NULL;
8445 /* FORNOW. This restriction should be relaxed. */
8446 gcc_assert (!nested_in_vect_loop);
8448 /* Create the vector that holds the step of the induction. */
8449 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8451 expr = build_int_cst (integer_type_node, nunits);
8452 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8454 else
8455 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8456 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8457 expr, step_expr);
8458 if (seq)
8460 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8461 gcc_assert (!new_bb);
8464 t = unshare_expr (new_name);
8465 gcc_assert (CONSTANT_CLASS_P (new_name)
8466 || TREE_CODE (new_name) == SSA_NAME);
8467 new_vec = build_vector_from_val (step_vectype, t);
8468 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8469 new_vec, step_vectype, NULL);
8471 vec_def = induc_def;
8472 for (i = 1; i < ncopies; i++)
8474 /* vec_i = vec_prev + vec_step */
8475 gimple_seq stmts = NULL;
8476 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8477 vec_def = gimple_build (&stmts,
8478 PLUS_EXPR, step_vectype, vec_def, vec_step);
8479 vec_def = gimple_convert (&stmts, vectype, vec_def);
8481 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8482 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8483 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8487 if (dump_enabled_p ())
8488 dump_printf_loc (MSG_NOTE, vect_location,
8489 "transform induction: created def-use cycle: %G%G",
8490 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8492 return true;
8495 /* Function vectorizable_live_operation.
8497 STMT_INFO computes a value that is used outside the loop. Check if
8498 it can be supported. */
8500 bool
8501 vectorizable_live_operation (vec_info *vinfo,
8502 stmt_vec_info stmt_info,
8503 gimple_stmt_iterator *gsi,
8504 slp_tree slp_node, slp_instance slp_node_instance,
8505 int slp_index, bool vec_stmt_p,
8506 stmt_vector_for_cost *cost_vec)
8508 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8509 imm_use_iterator imm_iter;
8510 tree lhs, lhs_type, bitsize;
8511 tree vectype = (slp_node
8512 ? SLP_TREE_VECTYPE (slp_node)
8513 : STMT_VINFO_VECTYPE (stmt_info));
8514 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8515 int ncopies;
8516 gimple *use_stmt;
8517 auto_vec<tree> vec_oprnds;
8518 int vec_entry = 0;
8519 poly_uint64 vec_index = 0;
8521 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8523 /* If a stmt of a reduction is live, vectorize it via
8524 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8525 validity so just trigger the transform here. */
8526 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8528 if (!vec_stmt_p)
8529 return true;
8530 if (slp_node)
8532 /* For reduction chains the meta-info is attached to
8533 the group leader. */
8534 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8535 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8536 /* For SLP reductions we vectorize the epilogue for
8537 all involved stmts together. */
8538 else if (slp_index != 0)
8539 return true;
8540 else
8541 /* For SLP reductions the meta-info is attached to
8542 the representative. */
8543 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8545 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8546 gcc_assert (reduc_info->is_reduc_info);
8547 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8548 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8549 return true;
8550 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8551 slp_node_instance);
8552 return true;
8555 /* If STMT is not relevant and it is a simple assignment and its inputs are
8556 invariant then it can remain in place, unvectorized. The original last
8557 scalar value that it computes will be used. */
8558 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8560 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8561 if (dump_enabled_p ())
8562 dump_printf_loc (MSG_NOTE, vect_location,
8563 "statement is simple and uses invariant. Leaving in "
8564 "place.\n");
8565 return true;
8568 if (slp_node)
8569 ncopies = 1;
8570 else
8571 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8573 if (slp_node)
8575 gcc_assert (slp_index >= 0);
8577 /* Get the last occurrence of the scalar index from the concatenation of
8578 all the slp vectors. Calculate which slp vector it is and the index
8579 within. */
8580 int num_scalar = SLP_TREE_LANES (slp_node);
8581 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8582 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
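/* For example, two vectors of four lanes holding six scalar lanes give
   pos = 2*4 - 6 + slp_index; slp_index 5 yields pos 7, which the
   division below splits into vec_entry 1 and vec_index 3. */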
8584 /* Calculate which vector contains the result, and which lane of
8585 that vector we need. */
8586 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8588 if (dump_enabled_p ())
8589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8590 "Cannot determine which vector holds the"
8591 " final result.\n");
8592 return false;
8596 if (!vec_stmt_p)
8598 /* No transformation required. */
8599 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8601 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8602 OPTIMIZE_FOR_SPEED))
8604 if (dump_enabled_p ())
8605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8606 "can't operate on partial vectors "
8607 "because the target doesn't support extract "
8608 "last reduction.\n");
8609 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8611 else if (slp_node)
8613 if (dump_enabled_p ())
8614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8615 "can't operate on partial vectors "
8616 "because an SLP statement is live after "
8617 "the loop.\n");
8618 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8620 else if (ncopies > 1)
8622 if (dump_enabled_p ())
8623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8624 "can't operate on partial vectors "
8625 "because ncopies is greater than 1.\n");
8626 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8628 else
8630 gcc_assert (ncopies == 1 && !slp_node);
8631 vect_record_loop_mask (loop_vinfo,
8632 &LOOP_VINFO_MASKS (loop_vinfo),
8633 1, vectype, NULL);
8636 /* ??? Enable for loop costing as well. */
8637 if (!loop_vinfo)
8638 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8639 0, vect_epilogue);
8640 return true;
8643 /* Use the lhs of the original scalar statement. */
8644 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8645 if (dump_enabled_p ())
8646 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8647 "stmt %G", stmt);
8649 lhs = gimple_get_lhs (stmt);
8650 lhs_type = TREE_TYPE (lhs);
8652 bitsize = vector_element_bits_tree (vectype);
8654 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8655 tree vec_lhs, bitstart;
8656 gimple *vec_stmt;
8657 if (slp_node)
8659 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8661 /* Get the correct slp vectorized stmt. */
8662 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8663 vec_lhs = gimple_get_lhs (vec_stmt);
8665 /* Get entry to use. */
8666 bitstart = bitsize_int (vec_index);
8667 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8669 else
8671 /* For multiple copies, get the last copy. */
8672 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8673 vec_lhs = gimple_get_lhs (vec_stmt);
8675 /* Get the last lane in the vector. */
8676 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
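/* For example, a four-element vector of 32-bit elements gives
   bitstart = 32 * 3 = 96, i.e. the start of the last lane. */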
8679 if (loop_vinfo)
8681 /* To ensure that VEC_LHS for the lane-extraction stmts satisfies the
8682 loop-closed PHI requirement, insert one phi node for it. It looks like:
8683 loop;
8685 # lhs' = PHI <lhs>
8687 loop;
8689 # vec_lhs' = PHI <vec_lhs>
8690 new_tree = lane_extract <vec_lhs', ...>;
8691 lhs' = new_tree; */
8693 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8694 basic_block exit_bb = single_exit (loop)->dest;
8695 gcc_assert (single_pred_p (exit_bb));
8697 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8698 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8699 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8701 gimple_seq stmts = NULL;
8702 tree new_tree;
8703 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8705 /* Emit:
8707 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8709 where VEC_LHS is the vectorized live-out result and MASK is
8710 the loop mask for the final iteration. */
8711 gcc_assert (ncopies == 1 && !slp_node);
8712 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8713 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8714 1, vectype, 0);
8715 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8716 mask, vec_lhs_phi);
8718 /* Convert the extracted vector element to the scalar type. */
8719 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8721 else
8723 tree bftype = TREE_TYPE (vectype);
8724 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8725 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8726 new_tree = build3 (BIT_FIELD_REF, bftype,
8727 vec_lhs_phi, bitsize, bitstart);
8728 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8729 &stmts, true, NULL_TREE);
8732 if (stmts)
8734 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8735 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8737 /* Remove the existing phi node for lhs and create a copy assignment from new_tree. */
8738 tree lhs_phi = NULL_TREE;
8739 gimple_stmt_iterator gsi;
8740 for (gsi = gsi_start_phis (exit_bb);
8741 !gsi_end_p (gsi); gsi_next (&gsi))
8743 gimple *phi = gsi_stmt (gsi);
8744 if (gimple_phi_arg_def (phi, 0) == lhs)
8746 remove_phi_node (&gsi, false);
8747 lhs_phi = gimple_phi_result (phi);
8748 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8749 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8750 break;
8755 /* Replace uses of lhs with the newly computed result. If the use stmt is
8756 a single-argument PHI, just replace all uses of the PHI result. This is
8757 necessary because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
8758 use_operand_p use_p;
8759 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8760 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8761 && !is_gimple_debug (use_stmt))
8763 if (gimple_code (use_stmt) == GIMPLE_PHI
8764 && gimple_phi_num_args (use_stmt) == 1)
8766 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8768 else
8770 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8771 SET_USE (use_p, new_tree);
8773 update_stmt (use_stmt);
8776 else
8778 /* For basic-block vectorization simply insert the lane-extraction. */
8779 tree bftype = TREE_TYPE (vectype);
8780 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8781 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8782 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8783 vec_lhs, bitsize, bitstart);
8784 gimple_seq stmts = NULL;
8785 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8786 &stmts, true, NULL_TREE);
8787 if (TREE_CODE (new_tree) == SSA_NAME
8788 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8789 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8790 if (is_a <gphi *> (vec_stmt))
8792 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8793 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8795 else
8797 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8798 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
9801 /* Replace uses of lhs with the newly computed result. If the use stmt is
9802 a single-argument PHI, just replace all uses of the PHI result. This is
9803 necessary because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
8804 use_operand_p use_p;
8805 stmt_vec_info use_stmt_info;
8806 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8807 if (!is_gimple_debug (use_stmt)
8808 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8809 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8811 /* ??? This can happen when the live lane ends up being
8812 used in a vector construction code-generated by an
8813 external SLP node (and code-generation for that already
8814 happened). See gcc.dg/vect/bb-slp-47.c.
8815 Doing this is what would happen if that vector CTOR
8816 were not code-generated yet so it is not too bad.
8817 ??? In fact we'd likely want to avoid this situation
8818 in the first place. */
8819 if (TREE_CODE (new_tree) == SSA_NAME
8820 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8821 && gimple_code (use_stmt) != GIMPLE_PHI
8822 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8823 use_stmt))
8825 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8826 gcc_assert (code == CONSTRUCTOR
8827 || code == VIEW_CONVERT_EXPR
8828 || CONVERT_EXPR_CODE_P (code));
8829 if (dump_enabled_p ())
8830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8831 "Using original scalar computation for "
8832 "live lane because use precedes vector "
8833 "def\n");
8834 continue;
8836 /* ??? It can also happen that we end up pulling a def into
8837 a loop where replacing out-of-loop uses would require
8838 a new LC SSA PHI node. Retain the original scalar in
8839 those cases as well. PR98064. */
8840 if (TREE_CODE (new_tree) == SSA_NAME
8841 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8842 && (gimple_bb (use_stmt)->loop_father
8843 != gimple_bb (vec_stmt)->loop_father)
8844 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8845 gimple_bb (use_stmt)->loop_father))
8847 if (dump_enabled_p ())
8848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8849 "Using original scalar computation for "
8850 "live lane because there is an out-of-loop "
8851 "definition for it\n");
8852 continue;
8854 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8855 SET_USE (use_p, new_tree);
8856 update_stmt (use_stmt);
8860 return true;
8863 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8865 static void
8866 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8868 ssa_op_iter op_iter;
8869 imm_use_iterator imm_iter;
8870 def_operand_p def_p;
8871 gimple *ustmt;
8873 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8875 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8877 basic_block bb;
8879 if (!is_gimple_debug (ustmt))
8880 continue;
8882 bb = gimple_bb (ustmt);
8884 if (!flow_bb_inside_loop_p (loop, bb))
8886 if (gimple_debug_bind_p (ustmt))
8888 if (dump_enabled_p ())
8889 dump_printf_loc (MSG_NOTE, vect_location,
8890 "killing debug use\n");
8892 gimple_debug_bind_reset_value (ustmt);
8893 update_stmt (ustmt);
8895 else
8896 gcc_unreachable ();
8902 /* Given loop represented by LOOP_VINFO, return true if computation of
8903 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8904 otherwise. */
8906 static bool
8907 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8909 /* Constant case. */
8910 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8912 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8913 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8915 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8916 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8917 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8918 return true;
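/* Otherwise fall back to the loop's iteration bound: if the maximum
   number of latch iterations is strictly below the maximum value of
   the niters type, NITERSM1 + 1 cannot wrap; e.g. for an unsigned
   32-bit niters type a bound of at most 0xfffffffe latch iterations
   suffices. */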
8921 widest_int max;
8922 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8923 /* Check the upper bound of loop niters. */
8924 if (get_max_loop_iterations (loop, &max))
8926 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8927 signop sgn = TYPE_SIGN (type);
8928 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8929 if (max < type_max)
8930 return true;
8932 return false;
8935 /* Return a mask type with half the number of elements as OLD_TYPE,
8936 given that it should have mode NEW_MODE. */
8938 tree
8939 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8941 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8942 return build_truth_vector_type_for_mode (nunits, new_mode);
8945 /* Return a mask type with twice as many elements as OLD_TYPE,
8946 given that it should have mode NEW_MODE. */
8948 tree
8949 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8951 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8952 return build_truth_vector_type_for_mode (nunits, new_mode);
8955 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8956 contain a sequence of NVECTORS masks that each control a vector of type
8957 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8958 these vector masks with the vector version of SCALAR_MASK. */
8960 void
8961 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8962 unsigned int nvectors, tree vectype, tree scalar_mask)
8964 gcc_assert (nvectors != 0);
8965 if (masks->length () < nvectors)
8966 masks->safe_grow_cleared (nvectors, true);
8967 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8968 /* The number of scalars per iteration and the number of vectors are
8969 both compile-time constants. */
8970 unsigned int nscalars_per_iter
8971 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8972 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
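/* For example, with a vectorization factor of 8, an rgroup of two
   8-element vectors controls 2 scalars per iteration. */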
8974 if (scalar_mask)
8976 scalar_cond_masked_key cond (scalar_mask, nvectors);
8977 loop_vinfo->scalar_cond_masked_set.add (cond);
8980 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8982 rgm->max_nscalars_per_iter = nscalars_per_iter;
8983 rgm->type = truth_type_for (vectype);
8984 rgm->factor = 1;
8988 /* Given a complete set of masks MASKS, extract mask number INDEX
8989 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8990 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8992 See the comment above vec_loop_masks for more details about the mask
8993 arrangement. */
8995 tree
8996 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8997 unsigned int nvectors, tree vectype, unsigned int index)
8999 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9000 tree mask_type = rgm->type;
9002 /* Populate the rgroup's mask array, if this is the first time we've
9003 used it. */
9004 if (rgm->controls.is_empty ())
9006 rgm->controls.safe_grow_cleared (nvectors, true);
9007 for (unsigned int i = 0; i < nvectors; ++i)
9009 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9010 /* Provide a dummy definition until the real one is available. */
9011 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9012 rgm->controls[i] = mask;
9016 tree mask = rgm->controls[index];
9017 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9018 TYPE_VECTOR_SUBPARTS (vectype)))
9020 /* A loop mask for data type X can be reused for data type Y
9021 if X has N times more elements than Y and if Y's elements
9022 are N times bigger than X's. In this case each sequence
9023 of N elements in the loop mask will be all-zero or all-one.
9024 We can then view-convert the mask so that each sequence of
9025 N elements is replaced by a single element. */
9026 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9027 TYPE_VECTOR_SUBPARTS (vectype)));
9028 gimple_seq seq = NULL;
9029 mask_type = truth_type_for (vectype);
9030 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9031 if (seq)
9032 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9034 return mask;
9037 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9038 lengths for controlling an operation on VECTYPE. The operation splits
9039 each element of VECTYPE into FACTOR separate subelements, measuring the
9040 length as a number of these subelements. */
9042 void
9043 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9044 unsigned int nvectors, tree vectype, unsigned int factor)
9046 gcc_assert (nvectors != 0);
9047 if (lens->length () < nvectors)
9048 lens->safe_grow_cleared (nvectors, true);
9049 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9051 /* The number of scalars per iteration, the bytes occupied per scalar
9052 and the number of vectors are all compile-time constants. */
9053 unsigned int nscalars_per_iter
9054 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9055 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9057 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9059 /* For now, we only support cases in which all loads and stores fall back
9060 to VnQI or none do. */
9061 gcc_assert (!rgl->max_nscalars_per_iter
9062 || (rgl->factor == 1 && factor == 1)
9063 || (rgl->max_nscalars_per_iter * rgl->factor
9064 == nscalars_per_iter * factor));
9065 rgl->max_nscalars_per_iter = nscalars_per_iter;
9066 rgl->type = vectype;
9067 rgl->factor = factor;
9071 /* Given a complete set of lengths LENS, extract length number INDEX for an
9072 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9074 tree
9075 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9076 unsigned int nvectors, unsigned int index)
9078 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9080 /* Populate the rgroup's len array, if this is the first time we've
9081 used it. */
9082 if (rgl->controls.is_empty ())
9084 rgl->controls.safe_grow_cleared (nvectors, true);
9085 for (unsigned int i = 0; i < nvectors; ++i)
9087 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9088 gcc_assert (len_type != NULL_TREE);
9089 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9091 /* Provide a dummy definition until the real one is available. */
9092 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9093 rgl->controls[i] = len;
9097 return rgl->controls[index];
9100 /* Scale the profiling counters of LOOP, which is vectorized by factor VF,
9101 using the estimated number of iterations. */
9103 static void
9104 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9106 edge preheader = loop_preheader_edge (loop);
9107 /* Reduce loop iterations by the vectorization factor. */
9108 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
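/* For example, a loop previously estimated to iterate about 100 times
   that is vectorized by a factor of 4 is now expected to iterate
   roughly 25 times. */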
9109 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9111 if (freq_h.nonzero_p ())
9113 profile_probability p;
9115 /* Avoid dropping loop body profile counter to 0 because of zero count
9116 in loop's preheader. */
9117 if (!(freq_e == profile_count::zero ()))
9118 freq_e = freq_e.force_nonzero ();
9119 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9120 scale_loop_frequencies (loop, p);
9123 edge exit_e = single_exit (loop);
9124 exit_e->probability = profile_probability::always ()
9125 .apply_scale (1, new_est_niter + 1);
9127 edge exit_l = single_pred_edge (loop->latch);
9128 profile_probability prob = exit_l->probability;
9129 exit_l->probability = exit_e->probability.invert ();
9130 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9131 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9134 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9135 latch edge values originally defined by it. */
9137 static void
9138 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9139 stmt_vec_info def_stmt_info)
9141 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9142 if (!def || TREE_CODE (def) != SSA_NAME)
9143 return;
9144 stmt_vec_info phi_info;
9145 imm_use_iterator iter;
9146 use_operand_p use_p;
9147 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9148 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9149 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9150 && (phi_info = loop_vinfo->lookup_stmt (phi))
9151 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9152 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9153 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9155 loop_p loop = gimple_bb (phi)->loop_father;
9156 edge e = loop_latch_edge (loop);
9157 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9159 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9160 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9161 gcc_assert (phi_defs.length () == latch_defs.length ());
9162 for (unsigned i = 0; i < phi_defs.length (); ++i)
9163 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9164 gimple_get_lhs (latch_defs[i]), e,
9165 gimple_phi_arg_location (phi, e->dest_idx));
9170 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9171 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9172 stmt_vec_info. */
9174 static bool
9175 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9176 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9178 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9179 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9181 if (dump_enabled_p ())
9182 dump_printf_loc (MSG_NOTE, vect_location,
9183 "------>vectorizing statement: %G", stmt_info->stmt);
9185 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9186 vect_loop_kill_debug_uses (loop, stmt_info);
9188 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9189 && !STMT_VINFO_LIVE_P (stmt_info))
9190 return false;
9192 if (STMT_VINFO_VECTYPE (stmt_info))
9194 poly_uint64 nunits
9195 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9196 if (!STMT_SLP_TYPE (stmt_info)
9197 && maybe_ne (nunits, vf)
9198 && dump_enabled_p ())
9199 /* For SLP VF is set according to unrolling factor, and not
9200 to vector size, hence for SLP this print is not valid. */
9201 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9204 /* Pure SLP statements have already been vectorized. We still need
9205 to apply loop vectorization to hybrid SLP statements. */
9206 if (PURE_SLP_STMT (stmt_info))
9207 return false;
9209 if (dump_enabled_p ())
9210 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9212 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9213 *seen_store = stmt_info;
9215 return true;
9218 /* Helper function to pass to simplify_replace_tree to enable replacing trees
9219 in the hash_map with their corresponding values. */
9221 static tree
9222 find_in_mapping (tree t, void *context)
9224 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9226 tree *value = mapping->get (t);
9227 return value ? *value : t;
9230 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9231 original loop that has now been vectorized.
9233 The inits of the data_references need to be advanced by the number of
9234 iterations of the main loop. This has been computed in vect_do_peeling and
9235 is stored in parameter ADVANCE. We first restore the data_references'
9236 initial offset with the values recorded in ORIG_DRS_INIT.
9238 Since the loop_vec_info of this EPILOGUE was constructed for the original
9239 loop, its stmt_vec_infos all point to the original statements. These need
9240 to be updated to point to their corresponding copies as well as the SSA_NAMES
9241 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9243 The data_references' connections also need to be updated. Their
9244 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
9245 stmt_vec_infos, their statements need to point to their corresponding copy,
9246 and if they are gather loads or scatter stores then their references need
9247 to be updated to point to their corresponding copies. Finally we set
9248 'base_misaligned' to false as we have already peeled for alignment in the
9249 prologue of the main loop. */
9251 static void
9252 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9254 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9255 auto_vec<gimple *> stmt_worklist;
9256 hash_map<tree,tree> mapping;
9257 gimple *orig_stmt, *new_stmt;
9258 gimple_stmt_iterator epilogue_gsi;
9259 gphi_iterator epilogue_phi_gsi;
9260 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9261 basic_block *epilogue_bbs = get_loop_body (epilogue);
9262 unsigned i;
9264 free (LOOP_VINFO_BBS (epilogue_vinfo));
9265 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9267 /* Advance the data_references by the number of iterations of the previous
9268 loop and its prologue. */
9269 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9272 /* The EPILOGUE loop is a copy of the original loop so they share the same
9273 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9274 point to the copied statements. We also create a mapping of all LHS' in
9275 the original loop and all the LHS' in the EPILOGUE and create worklists to
9276 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9277 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9279 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9280 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9282 new_stmt = epilogue_phi_gsi.phi ();
9284 gcc_assert (gimple_uid (new_stmt) > 0);
9285 stmt_vinfo
9286 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9288 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9289 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9291 mapping.put (gimple_phi_result (orig_stmt),
9292 gimple_phi_result (new_stmt));
9293 /* PHI nodes cannot have patterns or related statements. */
9294 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9295 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9298 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9299 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9301 new_stmt = gsi_stmt (epilogue_gsi);
9302 if (is_gimple_debug (new_stmt))
9303 continue;
9305 gcc_assert (gimple_uid (new_stmt) > 0);
9306 stmt_vinfo
9307 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9309 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9310 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9312 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9313 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9315 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9317 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9318 for (gimple_stmt_iterator gsi = gsi_start (seq);
9319 !gsi_end_p (gsi); gsi_next (&gsi))
9320 stmt_worklist.safe_push (gsi_stmt (gsi));
9323 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9324 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9326 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9327 stmt_worklist.safe_push (stmt);
9328 /* Set BB such that the assert in
9329 'get_initial_def_for_reduction' is able to determine that
9330 the BB of the related stmt is inside this loop. */
9331 gimple_set_bb (stmt,
9332 gimple_bb (new_stmt));
9333 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9334 gcc_assert (related_vinfo == NULL
9335 || related_vinfo == stmt_vinfo);
9340 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9341 using the original main loop and thus need to be updated to refer to the
9342 cloned variables used in the epilogue. */
9343 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9345 gimple *stmt = stmt_worklist[i];
9346 tree *new_op;
9348 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9350 tree op = gimple_op (stmt, j);
9351 if ((new_op = mapping.get(op)))
9352 gimple_set_op (stmt, j, *new_op);
9353 else
9355 /* PR92429: The last argument of simplify_replace_tree disables
9356 folding when replacing arguments. This is required as
9357 otherwise you might end up with different statements than the
9358 ones analyzed in vect_loop_analyze, leading to different
9359 vectorization. */
9360 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9361 &find_in_mapping, &mapping, false);
9362 gimple_set_op (stmt, j, op);
9367 struct data_reference *dr;
9368 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9369 FOR_EACH_VEC_ELT (datarefs, i, dr)
9371 orig_stmt = DR_STMT (dr);
9372 gcc_assert (gimple_uid (orig_stmt) > 0);
9373 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9374 /* Data references for gather loads and scatter stores do not use the
9375 updated offset we set using ADVANCE. Instead we have to make sure the
9376 references in the data references point to the corresponding copies of
9377 the originals in the epilogue. */
9378 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9379 == VMAT_GATHER_SCATTER)
9381 DR_REF (dr)
9382 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9383 &find_in_mapping, &mapping);
9384 DR_BASE_ADDRESS (dr)
9385 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9386 &find_in_mapping, &mapping);
9388 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9389 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9390 /* The vector size of the epilogue is smaller than that of the main loop,
9391 so the alignment requirement is either the same or lower. This means
9392 the dr will by definition be aligned. */
9393 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9396 epilogue_vinfo->shared->datarefs_copy.release ();
9397 epilogue_vinfo->shared->save_datarefs ();
9400 /* Function vect_transform_loop.
9402 The analysis phase has determined that the loop is vectorizable.
9403 Vectorize the loop: create vectorized stmts to replace the scalar
9404 stmts in the loop, and update the loop exit condition.
9405 Returns the scalar epilogue loop, if any. */
9407 class loop *
9408 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9410 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9411 class loop *epilogue = NULL;
9412 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9413 int nbbs = loop->num_nodes;
9414 int i;
9415 tree niters_vector = NULL_TREE;
9416 tree step_vector = NULL_TREE;
9417 tree niters_vector_mult_vf = NULL_TREE;
9418 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9419 unsigned int lowest_vf = constant_lower_bound (vf);
9420 gimple *stmt;
9421 bool check_profitability = false;
9422 unsigned int th;
9424 DUMP_VECT_SCOPE ("vec_transform_loop");
9426 loop_vinfo->shared->check_datarefs ();
9428 /* Use the more conservative vectorization threshold. If the number
9429 of iterations is constant, assume the cost check has been performed
9430 by our caller. If the threshold makes all loops profitable that
9431 run at least the (estimated) vectorization factor number of times,
9432 checking is pointless, too. */
9433 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9434 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9436 if (dump_enabled_p ())
9437 dump_printf_loc (MSG_NOTE, vect_location,
9438 "Profitability threshold is %d loop iterations.\n",
9439 th);
9440 check_profitability = true;
9443 /* Make sure there exists a single-predecessor exit bb. Do this before
9444 versioning. */
9445 edge e = single_exit (loop);
9446 if (! single_pred_p (e->dest))
9448 split_loop_exit_edge (e, true);
9449 if (dump_enabled_p ())
9450 dump_printf (MSG_NOTE, "split exit edge\n");
9453 /* Version the loop first, if required, so the profitability check
9454 comes first. */
9456 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9458 class loop *sloop
9459 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9460 sloop->force_vectorize = false;
9461 check_profitability = false;
9464 /* Make sure there exists a single-predecessor exit bb also on the
9465 scalar loop copy. Do this after versioning but before peeling
9466 so CFG structure is fine for both scalar and if-converted loop
9467 to make slpeel_duplicate_current_defs_from_edges face matched
9468 loop closed PHI nodes on the exit. */
9469 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9471 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9472 if (! single_pred_p (e->dest))
9474 split_loop_exit_edge (e, true);
9475 if (dump_enabled_p ())
9476 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9480 tree niters = vect_build_loop_niters (loop_vinfo);
9481 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9482 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9483 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9484 tree advance;
9485 drs_init_vec orig_drs_init;
9487 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9488 &step_vector, &niters_vector_mult_vf, th,
9489 check_profitability, niters_no_overflow,
9490 &advance);
9492 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9493 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9494 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9495 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9497 if (niters_vector == NULL_TREE)
9499 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9500 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9501 && known_eq (lowest_vf, vf))
9503 niters_vector
9504 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9505 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9506 step_vector = build_one_cst (TREE_TYPE (niters));
9508 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9509 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9510 &step_vector, niters_no_overflow);
9511 else
9512 /* vect_do_peeling subtracted the number of peeled prologue
9513 iterations from LOOP_VINFO_NITERS. */
9514 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9515 &niters_vector, &step_vector,
9516 niters_no_overflow);
9519 /* 1) Make sure the loop header has exactly two entries
9520 2) Make sure we have a preheader basic block. */
9522 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9524 split_edge (loop_preheader_edge (loop));
9526 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9527 /* This will deal with any possible peeling. */
9528 vect_prepare_for_masked_peels (loop_vinfo);
9530 /* Schedule the SLP instances first, then handle loop vectorization
9531 below. */
9532 if (!loop_vinfo->slp_instances.is_empty ())
9534 DUMP_VECT_SCOPE ("scheduling SLP instances");
9535 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9538 /* FORNOW: the vectorizer supports only loops whose body consists
9539 of one basic block (header + empty latch). When the vectorizer
9540 supports more involved loop forms, the order in which the BBs are
9541 traversed will need to be reconsidered. */
9543 for (i = 0; i < nbbs; i++)
9545 basic_block bb = bbs[i];
9546 stmt_vec_info stmt_info;
9548 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9549 gsi_next (&si))
9551 gphi *phi = si.phi ();
9552 if (dump_enabled_p ())
9553 dump_printf_loc (MSG_NOTE, vect_location,
9554 "------>vectorizing phi: %G", phi);
9555 stmt_info = loop_vinfo->lookup_stmt (phi);
9556 if (!stmt_info)
9557 continue;
9559 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9560 vect_loop_kill_debug_uses (loop, stmt_info);
9562 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9563 && !STMT_VINFO_LIVE_P (stmt_info))
9564 continue;
9566 if (STMT_VINFO_VECTYPE (stmt_info)
9567 && (maybe_ne
9568 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9569 && dump_enabled_p ())
9570 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9572 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9573 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9574 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9575 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9576 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9577 && ! PURE_SLP_STMT (stmt_info))
9579 if (dump_enabled_p ())
9580 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9581 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9585 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9586 gsi_next (&si))
9588 gphi *phi = si.phi ();
9589 stmt_info = loop_vinfo->lookup_stmt (phi);
9590 if (!stmt_info)
9591 continue;
9593 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9594 && !STMT_VINFO_LIVE_P (stmt_info))
9595 continue;
9597 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9598 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9599 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9600 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9601 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9602 && ! PURE_SLP_STMT (stmt_info))
9603 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9606 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9607 !gsi_end_p (si);)
9609 stmt = gsi_stmt (si);
9610 /* During vectorization remove existing clobber stmts. */
9611 if (gimple_clobber_p (stmt))
9613 unlink_stmt_vdef (stmt);
9614 gsi_remove (&si, true);
9615 release_defs (stmt);
9617 else
9619 /* Ignore vector stmts created in the outer loop. */
9620 stmt_info = loop_vinfo->lookup_stmt (stmt);
9622 /* vector stmts created in the outer-loop during vectorization of
9623 stmts in an inner-loop may not have a stmt_info, and do not
9624 need to be vectorized. */
9625 stmt_vec_info seen_store = NULL;
9626 if (stmt_info)
9628 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9630 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9631 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9632 !gsi_end_p (subsi); gsi_next (&subsi))
9634 stmt_vec_info pat_stmt_info
9635 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9636 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9637 &si, &seen_store);
9639 stmt_vec_info pat_stmt_info
9640 = STMT_VINFO_RELATED_STMT (stmt_info);
9641 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9642 &si, &seen_store))
9643 maybe_set_vectorized_backedge_value (loop_vinfo,
9644 pat_stmt_info);
9646 else
9648 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9649 &seen_store))
9650 maybe_set_vectorized_backedge_value (loop_vinfo,
9651 stmt_info);
9654 gsi_next (&si);
9655 if (seen_store)
9657 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9658 /* Interleaving. The vectorization of the
9659 interleaving chain was completed;
9660 free all the stores in the chain. */
9661 vect_remove_stores (loop_vinfo,
9662 DR_GROUP_FIRST_ELEMENT (seen_store));
9663 else
9664 /* Free the attached stmt_vec_info and remove the stmt. */
9665 loop_vinfo->remove_stmt (stmt_info);
9670 /* Stub out scalar statements that must not survive vectorization.
9671 Doing this here helps with grouped statements, or statements that
9672 are involved in patterns. */
9673 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9674 !gsi_end_p (gsi); gsi_next (&gsi))
9676 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9677 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9679 tree lhs = gimple_get_lhs (call);
9680 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9682 tree zero = build_zero_cst (TREE_TYPE (lhs));
9683 gimple *new_stmt = gimple_build_assign (lhs, zero);
9684 gsi_replace (&gsi, new_stmt, true);
9688 } /* BBs in loop */
9690 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9691 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9692 if (integer_onep (step_vector))
9693 niters_no_overflow = true;
9694 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9695 niters_vector_mult_vf, !niters_no_overflow);
9697 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9698 scale_profile_for_vect_loop (loop, assumed_vf);
9700 /* True if the final iteration might not handle a full vector's
9701 worth of scalar iterations. */
9702 bool final_iter_may_be_partial
9703 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9704 /* The minimum number of iterations performed by the epilogue. This
9705 is 1 when peeling for gaps because we always need a final scalar
9706 iteration. */
9707 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9708 /* +1 to convert latch counts to loop iteration counts,
9709 -min_epilogue_iters to remove iterations that cannot be performed
9710 by the vector code. */
9711 int bias_for_lowest = 1 - min_epilogue_iters;
9712 int bias_for_assumed = bias_for_lowest;
9713 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9714 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9716 /* When the amount of peeling is known at compile time, the first
9717 iteration will have exactly alignment_npeels active elements.
9718 In the worst case it will have at least one. */
9719 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9720 bias_for_lowest += lowest_vf - min_first_active;
9721 bias_for_assumed += assumed_vf - min_first_active;
9723 /* In these calculations the "- 1" converts loop iteration counts
9724 back to latch counts. */
9725 if (loop->any_upper_bound)
9726 loop->nb_iterations_upper_bound
9727 = (final_iter_may_be_partial
9728 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9729 lowest_vf) - 1
9730 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9731 lowest_vf) - 1);
9732 if (loop->any_likely_upper_bound)
9733 loop->nb_iterations_likely_upper_bound
9734 = (final_iter_may_be_partial
9735 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9736 + bias_for_lowest, lowest_vf) - 1
9737 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9738 + bias_for_lowest, lowest_vf) - 1);
9739 if (loop->any_estimate)
9740 loop->nb_iterations_estimate
9741 = (final_iter_may_be_partial
9742 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9743 assumed_vf) - 1
9744 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9745 assumed_vf) - 1);
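/* For example, assuming no peeling for gaps or alignment (so the bias
   is 1) and a full final vector iteration, an upper bound of 103 latch
   iterations with lowest_vf 4 becomes floor ((103 + 1) / 4) - 1 = 25
   latch iterations of the vector loop. */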
9747 if (dump_enabled_p ())
9749 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9751 dump_printf_loc (MSG_NOTE, vect_location,
9752 "LOOP VECTORIZED\n");
9753 if (loop->inner)
9754 dump_printf_loc (MSG_NOTE, vect_location,
9755 "OUTER LOOP VECTORIZED\n");
9756 dump_printf (MSG_NOTE, "\n");
9758 else
9759 dump_printf_loc (MSG_NOTE, vect_location,
9760 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9761 GET_MODE_NAME (loop_vinfo->vector_mode));
9764 /* Loops vectorized with a variable factor won't benefit from
9765 unrolling/peeling. */
9766 if (!vf.is_constant ())
9768 loop->unroll = 1;
9769 if (dump_enabled_p ())
9770 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9771 " variable-length vectorization factor\n");
9773 /* Free SLP instances here because otherwise stmt reference counting
9774 won't work. */
9775 slp_instance instance;
9776 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9777 vect_free_slp_instance (instance);
9778 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9779 /* Clear the safelen field since its value is invalid after vectorization:
9780 the vectorized loop can have loop-carried dependencies. */
9781 loop->safelen = 0;
9783 if (epilogue)
9785 update_epilogue_loop_vinfo (epilogue, advance);
9787 epilogue->simduid = loop->simduid;
9788 epilogue->force_vectorize = loop->force_vectorize;
9789 epilogue->dont_vectorize = false;
9792 return epilogue;
9795 /* The code below tries to perform a simple optimization: revert
9796 if-conversion for masked stores, i.e. if the mask of a store is zero,
9797 do not perform it and, if possible, skip all stored-value producers too.
9798 For example,
9799 for (i=0; i<n; i++)
9800 if (c[i])
9802 p1[i] += 1;
9803 p2[i] = p3[i] +2;
9805 this transformation will produce the following semi-hammock:
9807 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9809 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9810 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9811 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9812 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9813 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9814 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9818 void
9819 optimize_mask_stores (class loop *loop)
9821 basic_block *bbs = get_loop_body (loop);
9822 unsigned nbbs = loop->num_nodes;
9823 unsigned i;
9824 basic_block bb;
9825 class loop *bb_loop;
9826 gimple_stmt_iterator gsi;
9827 gimple *stmt;
9828 auto_vec<gimple *> worklist;
9829 auto_purge_vect_location sentinel;
9831 vect_location = find_loop_location (loop);
9832 /* Pick up all masked stores in loop if any. */
9833 for (i = 0; i < nbbs; i++)
9835 bb = bbs[i];
9836 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9837 gsi_next (&gsi))
9839 stmt = gsi_stmt (gsi);
9840 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9841 worklist.safe_push (stmt);
9845 free (bbs);
9846 if (worklist.is_empty ())
9847 return;
9849 /* Loop has masked stores. */
9850 while (!worklist.is_empty ())
9852 gimple *last, *last_store;
9853 edge e, efalse;
9854 tree mask;
9855 basic_block store_bb, join_bb;
9856 gimple_stmt_iterator gsi_to;
9857 tree vdef, new_vdef;
9858 gphi *phi;
9859 tree vectype;
9860 tree zero;
9862 last = worklist.pop ();
9863 mask = gimple_call_arg (last, 2);
9864 bb = gimple_bb (last);
9865 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9866 to the same loop as if_bb. It could be different from LOOP when a
9867 two-level loop nest is vectorized and the mask_store belongs to the
9868 inner one. */
9869 e = split_block (bb, last);
9870 bb_loop = bb->loop_father;
9871 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9872 join_bb = e->dest;
9873 store_bb = create_empty_bb (bb);
9874 add_bb_to_loop (store_bb, bb_loop);
9875 e->flags = EDGE_TRUE_VALUE;
9876 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9877 /* Put STORE_BB on the unlikely path. */
9878 efalse->probability = profile_probability::unlikely ();
9879 store_bb->count = efalse->count ();
9880 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9881 if (dom_info_available_p (CDI_DOMINATORS))
9882 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9883 if (dump_enabled_p ())
9884 dump_printf_loc (MSG_NOTE, vect_location,
9885 "Create new block %d to sink mask stores.",
9886 store_bb->index);
9887 /* Create vector comparison with boolean result. */
9888 vectype = TREE_TYPE (mask);
9889 zero = build_zero_cst (vectype);
9890 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9891 gsi = gsi_last_bb (bb);
9892 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9893 /* Create new PHI node for vdef of the last masked store:
9894 .MEM_2 = VDEF <.MEM_1>
9895 will be converted to
9896 .MEM.3 = VDEF <.MEM_1>
9897 and new PHI node will be created in join bb
9898 .MEM_2 = PHI <.MEM_1, .MEM_3>
9900 vdef = gimple_vdef (last);
9901 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9902 gimple_set_vdef (last, new_vdef);
9903 phi = create_phi_node (vdef, join_bb);
9904 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9906 /* Put all masked stores with the same mask to STORE_BB if possible. */
9907 while (true)
9909 gimple_stmt_iterator gsi_from;
9910 gimple *stmt1 = NULL;
9912 /* Move masked store to STORE_BB. */
9913 last_store = last;
9914 gsi = gsi_for_stmt (last);
9915 gsi_from = gsi;
9916 /* Shift GSI to the previous stmt for further traversal. */
9917 gsi_prev (&gsi);
9918 gsi_to = gsi_start_bb (store_bb);
9919 gsi_move_before (&gsi_from, &gsi_to);
9920 /* Setup GSI_TO to the non-empty block start. */
9921 gsi_to = gsi_start_bb (store_bb);
9922 if (dump_enabled_p ())
9923 dump_printf_loc (MSG_NOTE, vect_location,
9924 "Move stmt to created bb\n%G", last);
9925 /* Move all stored value producers if possible. */
9926 while (!gsi_end_p (gsi))
9928 tree lhs;
9929 imm_use_iterator imm_iter;
9930 use_operand_p use_p;
9931 bool res;
9933 /* Skip debug statements. */
9934 if (is_gimple_debug (gsi_stmt (gsi)))
9936 gsi_prev (&gsi);
9937 continue;
9939 stmt1 = gsi_stmt (gsi);
9940 /* Do not consider statements writing to memory or having
9941 volatile operand. */
9942 if (gimple_vdef (stmt1)
9943 || gimple_has_volatile_ops (stmt1))
9944 break;
9945 gsi_from = gsi;
9946 gsi_prev (&gsi);
9947 lhs = gimple_get_lhs (stmt1);
9948 if (!lhs)
9949 break;
9951 /* LHS of vectorized stmt must be SSA_NAME. */
9952 if (TREE_CODE (lhs) != SSA_NAME)
9953 break;
9955 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9957 /* Remove dead scalar statement. */
9958 if (has_zero_uses (lhs))
9960 gsi_remove (&gsi_from, true);
9961 continue;
9965 /* Check that LHS does not have uses outside of STORE_BB. */
9966 res = true;
9967 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9969 gimple *use_stmt;
9970 use_stmt = USE_STMT (use_p);
9971 if (is_gimple_debug (use_stmt))
9972 continue;
9973 if (gimple_bb (use_stmt) != store_bb)
9975 res = false;
9976 break;
9979 if (!res)
9980 break;
9982 if (gimple_vuse (stmt1)
9983 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9984 break;
9986 /* Can move STMT1 to STORE_BB. */
9987 if (dump_enabled_p ())
9988 dump_printf_loc (MSG_NOTE, vect_location,
9989 "Move stmt to created bb\n%G", stmt1);
9990 gsi_move_before (&gsi_from, &gsi_to);
9991 /* Shift GSI_TO for further insertion. */
9992 gsi_prev (&gsi_to);
9994 /* Put other masked stores with the same mask to STORE_BB. */
9995 if (worklist.is_empty ()
9996 || gimple_call_arg (worklist.last (), 2) != mask
9997 || worklist.last () != stmt1)
9998 break;
9999 last = worklist.pop ();
10001 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10005 /* Decide whether it is possible to use a zero-based induction variable
10006 when vectorizing LOOP_VINFO with partial vectors. If it is, return
10007 the value that the induction variable must be able to hold in order
10008 to ensure that the rgroups eventually have no active vector elements.
10009 Return -1 otherwise. */
10011 widest_int
10012 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10014 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10015 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10016 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10018 /* Calculate the value that the induction variable must be able
10019 to hit in order to ensure that we end the loop with an all-false mask.
10020 This involves adding the maximum number of inactive trailing scalar
10021 iterations. */
10022 widest_int iv_limit = -1;
10023 if (max_loop_iterations (loop, &iv_limit))
10025 if (niters_skip)
10027 /* Add the maximum number of skipped iterations to the
10028 maximum iteration count. */
10029 if (TREE_CODE (niters_skip) == INTEGER_CST)
10030 iv_limit += wi::to_widest (niters_skip);
10031 else
10032 iv_limit += max_vf - 1;
10034 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10035 /* Make a conservatively-correct assumption. */
10036 iv_limit += max_vf - 1;
10038 /* IV_LIMIT is the maximum number of latch iterations, which is also
10039 the maximum in-range IV value. Round this value down to the previous
10040 vector alignment boundary and then add an extra full iteration. */
10041 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10042 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
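/* For example, with a constant VF of 4 (so max_vf is also 4) and a
   maximum of 10 latch iterations this gives (10 & -4) + 4 = 12. */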
10044 return iv_limit;
10047 /* For the given rgroup_controls RGC, check whether an induction variable
10048 would ever hit a value that produces a set of all-false masks or zero
10049 lengths before wrapping around. Return true if it's possible to wrap
10050 around before hitting the desirable value, otherwise return false. */
10052 bool
10053 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10055 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10057 if (iv_limit == -1)
10058 return true;
10060 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10061 unsigned int compare_precision = TYPE_PRECISION (compare_type);
10062 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
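/* For example, an iv_limit of 255 with 2 items per iteration needs
   min_precision (510) = 9 bits, so an 8-bit compare type could wrap. */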
10064 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10065 return true;
10067 return false;