gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
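/* As a minimal standalone sketch of the rewrite described above (the
   same V8HI example), the two functions below show the scalar form and
   the hand-vectorized form the pass aims to produce.  All example_*
   names and the fixed bound are illustrative only and are not part of
   the vectorizer.  */

#define EXAMPLE_N 64

typedef short example_v8hi __attribute__ ((vector_size (16)));

short example_a[EXAMPLE_N], example_b[EXAMPLE_N], example_c[EXAMPLE_N];

/* Scalar form, as written by the user.  */
static void
example_scalar_add (void)
{
  for (int i = 0; i < EXAMPLE_N; i++)
    example_a[i] = example_b[i] + example_c[i];
}

/* Vector form: eight short additions per iteration (EXAMPLE_N is a
   multiple of 8 here, so no epilogue loop is shown).  */
static void
example_vector_add (void)
{
  example_v8hi *pa = (example_v8hi *) example_a;
  example_v8hi *pb = (example_v8hi *) example_b;
  example_v8hi *pc = (example_v8hi *) example_c;
  for (int i = 0; i < EXAMPLE_N / 8; i++)
    pa[i] = pb[i] + pc[i];
}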
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
158 bool *, bool *);
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
164 static opt_result
165 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
166 bool vectype_maybe_set_p,
167 poly_uint64 *vf)
169 gimple *stmt = stmt_info->stmt;
171 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
172 && !STMT_VINFO_LIVE_P (stmt_info))
173 || gimple_clobber_p (stmt))
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
177 return opt_result::success ();
180 tree stmt_vectype, nunits_vectype;
181 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
182 &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
187 if (stmt_vectype)
189 if (STMT_VINFO_VECTYPE (stmt_info))
190 /* The only case when a vectype had been already set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. Return true on success
209 or false if something prevented vectorization. */
211 static opt_result
212 vect_determine_vf_for_stmt (vec_info *vinfo,
213 stmt_vec_info stmt_info, poly_uint64 *vf)
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
217 stmt_info->stmt);
218 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
219 if (!res)
220 return res;
222 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
223 && STMT_VINFO_RELATED_STMT (stmt_info))
225 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
226 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
228 /* If a pattern statement has def stmts, analyze them too. */
229 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
230 !gsi_end_p (si); gsi_next (&si))
232 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
233 if (dump_enabled_p ())
234 dump_printf_loc (MSG_NOTE, vect_location,
235 "==> examining pattern def stmt: %G",
236 def_stmt_info->stmt);
237 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
238 if (!res)
239 return res;
242 if (dump_enabled_p ())
243 dump_printf_loc (MSG_NOTE, vect_location,
244 "==> examining pattern statement: %G",
245 stmt_info->stmt);
246 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
247 if (!res)
248 return res;
251 return opt_result::success ();
254 /* Function vect_determine_vectorization_factor
256 Determine the vectorization factor (VF). VF is the number of data elements
257 that are operated upon in parallel in a single iteration of the vectorized
258 loop. For example, when vectorizing a loop that operates on 4-byte elements,
259 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
260 elements can fit in a single vector register.
262 We currently support vectorization of loops in which all types operated upon
263 are of the same size. Therefore this function currently sets VF according to
264 the size of the types operated upon, and fails if there are multiple sizes
265 in the loop.
267 VF is also the factor by which the loop iterations are strip-mined, e.g.:
268 original loop:
269 for (i=0; i<N; i++){
270 a[i] = b[i] + c[i];
273 vectorized loop:
274 for (i=0; i<N; i+=VF){
275 a[i:VF] = b[i:VF] + c[i:VF];
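/* Illustrative arithmetic for the comment above: with 4-byte elements
   and 16-byte vectors, VF is 16 / 4 = 4, so a 103-iteration scalar
   loop runs 25 full vector iterations and leaves 3 scalar iterations
   for an epilogue.  The helper below is only a sketch; it is not how
   the vectorizer itself computes VF.  */

static inline unsigned
example_vectorization_factor (unsigned vector_size_in_bytes,
                              unsigned element_size_in_bytes)
{
  return vector_size_in_bytes / element_size_in_bytes;
}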
279 static opt_result
280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
282 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
283 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
284 unsigned nbbs = loop->num_nodes;
285 poly_uint64 vectorization_factor = 1;
286 tree scalar_type = NULL_TREE;
287 gphi *phi;
288 tree vectype;
289 stmt_vec_info stmt_info;
290 unsigned i;
292 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
294 for (i = 0; i < nbbs; i++)
296 basic_block bb = bbs[i];
298 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
299 gsi_next (&si))
301 phi = si.phi ();
302 stmt_info = loop_vinfo->lookup_stmt (phi);
303 if (dump_enabled_p ())
304 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
305 phi);
307 gcc_assert (stmt_info);
309 if (STMT_VINFO_RELEVANT_P (stmt_info)
310 || STMT_VINFO_LIVE_P (stmt_info))
312 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
313 scalar_type = TREE_TYPE (PHI_RESULT (phi));
315 if (dump_enabled_p ())
316 dump_printf_loc (MSG_NOTE, vect_location,
317 "get vectype for scalar type: %T\n",
318 scalar_type);
320 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
321 if (!vectype)
322 return opt_result::failure_at (phi,
323 "not vectorized: unsupported "
324 "data-type %T\n",
325 scalar_type);
326 STMT_VINFO_VECTYPE (stmt_info) = vectype;
328 if (dump_enabled_p ())
329 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
330 vectype);
332 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
335 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
336 dump_printf (MSG_NOTE, "\n");
339 vect_update_max_nunits (&vectorization_factor, vectype);
343 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
344 gsi_next (&si))
346 if (is_gimple_debug (gsi_stmt (si)))
347 continue;
348 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
349 opt_result res
350 = vect_determine_vf_for_stmt (loop_vinfo,
351 stmt_info, &vectorization_factor);
352 if (!res)
353 return res;
357 /* TODO: Analyze cost. Decide if worth while to vectorize. */
358 if (dump_enabled_p ())
360 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
361 dump_dec (MSG_NOTE, vectorization_factor);
362 dump_printf (MSG_NOTE, "\n");
365 if (known_le (vectorization_factor, 1U))
366 return opt_result::failure_at (vect_location,
367 "not vectorized: unsupported data-type\n");
368 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
369 return opt_result::success ();
373 /* Function vect_is_simple_iv_evolution.
375 FORNOW: A simple evolution of an induction variable in the loop is
376 considered a polynomial evolution. */
378 static bool
379 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
380 tree * step)
382 tree init_expr;
383 tree step_expr;
384 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
385 basic_block bb;
387 /* When there is no evolution in this loop, the evolution function
388 is not "simple". */
389 if (evolution_part == NULL_TREE)
390 return false;
392 /* When the evolution is a polynomial of degree >= 2
393 the evolution function is not "simple". */
394 if (tree_is_chrec (evolution_part))
395 return false;
397 step_expr = evolution_part;
398 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
400 if (dump_enabled_p ())
401 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
402 step_expr, init_expr);
404 *init = init_expr;
405 *step = step_expr;
407 if (TREE_CODE (step_expr) != INTEGER_CST
408 && (TREE_CODE (step_expr) != SSA_NAME
409 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
410 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
411 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
412 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
413 || !flag_associative_math)))
414 && (TREE_CODE (step_expr) != REAL_CST
415 || !flag_associative_math))
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419 "step unknown.\n");
420 return false;
423 return true;
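/* Worked example of the "simple" evolutions this predicate accepts:
   for a loop like

     for (i = 7; i < n; i += 3)

   scev describes i as the chrec {7, +, 3}_loop, so *INIT is 7 and
   *STEP is 3.  A step that is itself a chrec (e.g. i += j; j += 1)
   is rejected.  The helper below is an illustrative sketch of how
   such an affine IV evaluates at a given iteration; it is not part
   of the vectorizer.  */

static inline long
example_affine_iv_value (long init, long step, long iteration)
{
  /* Value of {init, +, step} after ITERATION back edges.  */
  return init + iteration * step;
}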
426 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
427 what we are assuming is a double reduction. For example, given
428 a structure like this:
430 outer1:
431 x_1 = PHI <x_4(outer2), ...>;
434 inner:
435 x_2 = PHI <x_1(outer1), ...>;
437 x_3 = ...;
440 outer2:
441 x_4 = PHI <x_3(inner)>;
444 outer loop analysis would treat x_1 as a double reduction phi and
445 this function would then return true for x_2. */
447 static bool
448 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
450 use_operand_p use_p;
451 ssa_op_iter op_iter;
452 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
453 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
454 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
455 return true;
456 return false;
459 /* Function vect_analyze_scalar_cycles_1.
461 Examine the cross iteration def-use cycles of scalar variables
462 in LOOP. LOOP_VINFO represents the loop that is now being
463 considered for vectorization (can be LOOP, or an outer-loop
464 enclosing LOOP). */
466 static void
467 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
469 basic_block bb = loop->header;
470 tree init, step;
471 auto_vec<stmt_vec_info, 64> worklist;
472 gphi_iterator gsi;
473 bool double_reduc, reduc_chain;
475 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
477 /* First - identify all inductions. Reduction detection assumes that all the
478 inductions have been identified, therefore, this order must not be
479 changed. */
480 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
482 gphi *phi = gsi.phi ();
483 tree access_fn = NULL;
484 tree def = PHI_RESULT (phi);
485 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
490 /* Skip virtual phi's. The data dependences that are associated with
491 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
492 if (virtual_operand_p (def))
493 continue;
495 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
497 /* Analyze the evolution function. */
498 access_fn = analyze_scalar_evolution (loop, def);
499 if (access_fn)
501 STRIP_NOPS (access_fn);
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE, vect_location,
504 "Access function of PHI: %T\n", access_fn);
505 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
506 = initial_condition_in_loop_num (access_fn, loop->num);
507 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
508 = evolution_part_in_loop_num (access_fn, loop->num);
511 if (!access_fn
512 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
513 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
514 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
515 && TREE_CODE (step) != INTEGER_CST))
517 worklist.safe_push (stmt_vinfo);
518 continue;
521 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 != NULL_TREE);
523 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
525 if (dump_enabled_p ())
526 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
527 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
531 /* Second - identify all reductions and nested cycles. */
532 while (worklist.length () > 0)
534 stmt_vec_info stmt_vinfo = worklist.pop ();
535 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
536 tree def = PHI_RESULT (phi);
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
541 gcc_assert (!virtual_operand_p (def)
542 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
544 stmt_vec_info reduc_stmt_info
545 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
546 &reduc_chain);
547 if (reduc_stmt_info)
549 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
550 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
551 if (double_reduc)
553 if (dump_enabled_p ())
554 dump_printf_loc (MSG_NOTE, vect_location,
555 "Detected double reduction.\n");
557 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
558 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
560 else
562 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "Detected vectorizable nested cycle.\n");
568 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
570 else
572 if (dump_enabled_p ())
573 dump_printf_loc (MSG_NOTE, vect_location,
574 "Detected reduction.\n");
576 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
577 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
578 /* Store the reduction cycles for possible vectorization in
579 loop-aware SLP if it was not detected as reduction
580 chain. */
581 if (! reduc_chain)
582 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
583 (reduc_stmt_info);
587 else
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
590 "Unknown def-use cycle pattern.\n");
595 /* Function vect_analyze_scalar_cycles.
597 Examine the cross iteration def-use cycles of scalar variables, by
598 analyzing the loop-header PHIs of scalar variables. Classify each
599 cycle as one of the following: invariant, induction, reduction, unknown.
600 We do that for the loop represented by LOOP_VINFO, and also for its
601 inner-loop, if it exists.
602 Examples for scalar cycles:
604 Example1: reduction:
606 loop1:
607 for (i=0; i<N; i++)
608 sum += a[i];
610 Example2: induction:
612 loop2:
613 for (i=0; i<N; i++)
614 a[i] = i; */
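/* A loop mixing both cycle kinds from the examples above; the comments
   mark how the corresponding loop-header PHIs are classified.  The
   function is illustrative only.  */

static int
example_scalar_cycles (const int *a, int n)
{
  int sum = 0;                  /* PHI for sum: vect_reduction_def.  */
  for (int i = 0; i < n; i++)   /* PHI for i: vect_induction_def.  */
    sum += a[i];
  return sum;
}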
616 static void
617 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
619 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
621 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
623 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
624 Reductions in such inner-loop therefore have different properties than
625 the reductions in the nest that gets vectorized:
626 1. When vectorized, they are executed in the same order as in the original
627 scalar loop, so we can't change the order of computation when
628 vectorizing them.
629 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
630 current checks are too strict. */
632 if (loop->inner)
633 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
636 /* Transfer group and reduction information from STMT_INFO to its
637 pattern stmt. */
639 static void
640 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
642 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
643 stmt_vec_info stmtp;
644 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
645 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
646 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
649 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
650 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
651 == STMT_VINFO_DEF_TYPE (stmt_info));
652 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
653 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
654 if (stmt_info)
655 REDUC_GROUP_NEXT_ELEMENT (stmtp)
656 = STMT_VINFO_RELATED_STMT (stmt_info);
658 while (stmt_info);
661 /* Fixup scalar cycles that now have their stmts detected as patterns. */
663 static void
664 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
666 stmt_vec_info first;
667 unsigned i;
669 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
674 if ((STMT_VINFO_IN_PATTERN_P (next)
675 != STMT_VINFO_IN_PATTERN_P (first))
676 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
677 break;
678 next = REDUC_GROUP_NEXT_ELEMENT (next);
680 /* If all reduction chain members are well-formed patterns, adjust
681 the group to group the pattern stmts instead. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
685 if (STMT_VINFO_IN_PATTERN_P (first))
687 vect_fixup_reduc_chain (first);
688 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
689 = STMT_VINFO_RELATED_STMT (first);
692 /* If not all stmts in the chain are patterns, or if we failed
693 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
694 it as a regular reduction instead. */
695 else
697 stmt_vec_info vinfo = first;
698 stmt_vec_info last = NULL;
699 while (vinfo)
701 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
702 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
703 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
704 last = vinfo;
705 vinfo = next;
707 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
708 = vect_internal_def;
709 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
710 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
711 --i;
716 /* Function vect_get_loop_niters.
718 Determine how many iterations the loop is executed and place it
719 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
720 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
721 niter information holds in ASSUMPTIONS.
723 Return the loop exit condition. */
726 static gcond *
727 vect_get_loop_niters (class loop *loop, tree *assumptions,
728 tree *number_of_iterations, tree *number_of_iterationsm1)
730 edge exit = single_exit (loop);
731 class tree_niter_desc niter_desc;
732 tree niter_assumptions, niter, may_be_zero;
733 gcond *cond = get_loop_exit_condition (loop);
735 *assumptions = boolean_true_node;
736 *number_of_iterationsm1 = chrec_dont_know;
737 *number_of_iterations = chrec_dont_know;
738 DUMP_VECT_SCOPE ("get_loop_niters");
740 if (!exit)
741 return cond;
743 may_be_zero = NULL_TREE;
744 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
745 || chrec_contains_undetermined (niter_desc.niter))
746 return cond;
748 niter_assumptions = niter_desc.assumptions;
749 may_be_zero = niter_desc.may_be_zero;
750 niter = niter_desc.niter;
752 if (may_be_zero && integer_zerop (may_be_zero))
753 may_be_zero = NULL_TREE;
755 if (may_be_zero)
757 if (COMPARISON_CLASS_P (may_be_zero))
759 /* Try to combine may_be_zero with assumptions, this can simplify
760 computation of niter expression. */
761 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
762 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
763 niter_assumptions,
764 fold_build1 (TRUTH_NOT_EXPR,
765 boolean_type_node,
766 may_be_zero));
767 else
768 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
769 build_int_cst (TREE_TYPE (niter), 0),
770 rewrite_to_non_trapping_overflow (niter));
772 may_be_zero = NULL_TREE;
774 else if (integer_nonzerop (may_be_zero))
776 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
777 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
778 return cond;
780 else
781 return cond;
784 *assumptions = niter_assumptions;
785 *number_of_iterationsm1 = niter;
787 /* We want the number of loop header executions which is the number
788 of latch executions plus one.
789 ??? For UINT_MAX latch executions this number overflows to zero
790 for loops like do { n++; } while (n != 0); */
791 if (niter && !chrec_contains_undetermined (niter))
792 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
793 build_int_cst (TREE_TYPE (niter), 1));
794 *number_of_iterations = niter;
796 return cond;
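/* Worked example of the two counts computed above, for a loop already
   in the do-while shape the vectorizer expects; purely illustrative.
   For n == 5 the latch (back edge) is taken 4 times, so
   NUMBER_OF_ITERATIONSM1 is 4 and NUMBER_OF_ITERATIONS (header
   executions) is 5.  For n == 0 the header still executes once.  */

static unsigned
example_count_header_executions (unsigned n)
{
  unsigned i = 0, header_execs = 0;
  do
    {
      header_execs++;   /* NUMBER_OF_ITERATIONS counts these.  */
      i++;
    }
  while (i < n);        /* Taken NUMBER_OF_ITERATIONSM1 times.  */
  return header_execs;
}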
799 /* Function bb_in_loop_p
801 Used as predicate for dfs order traversal of the loop bbs. */
803 static bool
804 bb_in_loop_p (const_basic_block bb, const void *data)
806 const class loop *const loop = (const class loop *)data;
807 if (flow_bb_inside_loop_p (loop, bb))
808 return true;
809 return false;
813 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
814 stmt_vec_info structs for all the stmts in LOOP_IN. */
816 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
817 : vec_info (vec_info::loop, init_cost (loop_in, false), shared),
818 loop (loop_in),
819 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
820 num_itersm1 (NULL_TREE),
821 num_iters (NULL_TREE),
822 num_iters_unchanged (NULL_TREE),
823 num_iters_assumptions (NULL_TREE),
824 th (0),
825 versioning_threshold (0),
826 vectorization_factor (0),
827 main_loop_edge (nullptr),
828 skip_main_loop_edge (nullptr),
829 skip_this_loop_edge (nullptr),
830 reusable_accumulators (),
831 max_vectorization_factor (0),
832 mask_skip_niters (NULL_TREE),
833 rgroup_compare_type (NULL_TREE),
834 simd_if_cond (NULL_TREE),
835 unaligned_dr (NULL),
836 peeling_for_alignment (0),
837 ptr_mask (0),
838 ivexpr_map (NULL),
839 scan_map (NULL),
840 slp_unrolling_factor (1),
841 single_scalar_iteration_cost (0),
842 vec_outside_cost (0),
843 vec_inside_cost (0),
844 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
845 vectorizable (false),
846 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
847 using_partial_vectors_p (false),
848 epil_using_partial_vectors_p (false),
849 peeling_for_gaps (false),
850 peeling_for_niter (false),
851 no_data_dependencies (false),
852 has_mask_store (false),
853 scalar_loop_scaling (profile_probability::uninitialized ()),
854 scalar_loop (NULL),
855 orig_loop_info (NULL)
857 /* CHECKME: We want to visit all BBs before their successors (except for
858 latch blocks, for which this assertion wouldn't hold). In the simple
859 case of the loop forms we allow, a dfs order of the BBs would be the same
860 as reversed postorder traversal, so we are safe. */
862 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
863 bbs, loop->num_nodes, loop);
864 gcc_assert (nbbs == loop->num_nodes);
866 for (unsigned int i = 0; i < nbbs; i++)
868 basic_block bb = bbs[i];
869 gimple_stmt_iterator si;
871 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
873 gimple *phi = gsi_stmt (si);
874 gimple_set_uid (phi, 0);
875 add_stmt (phi);
878 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
880 gimple *stmt = gsi_stmt (si);
881 gimple_set_uid (stmt, 0);
882 if (is_gimple_debug (stmt))
883 continue;
884 add_stmt (stmt);
885 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
886 third argument is the #pragma omp simd if (x) condition.  When it is 0,
887 the loop shouldn't be vectorized; when it is a non-zero constant, it
888 should be vectorized normally; otherwise the loop is versioned, with the
889 vectorized copy executed if the condition is non-zero at runtime. */
890 if (loop_in->simduid
891 && is_gimple_call (stmt)
892 && gimple_call_internal_p (stmt)
893 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
894 && gimple_call_num_args (stmt) >= 3
895 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
896 && (loop_in->simduid
897 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
899 tree arg = gimple_call_arg (stmt, 2);
900 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
901 simd_if_cond = arg;
902 else
903 gcc_assert (integer_nonzerop (arg));
908 epilogue_vinfos.create (6);
911 /* Free all levels of rgroup CONTROLS. */
913 void
914 release_vec_loop_controls (vec<rgroup_controls> *controls)
916 rgroup_controls *rgc;
917 unsigned int i;
918 FOR_EACH_VEC_ELT (*controls, i, rgc)
919 rgc->controls.release ();
920 controls->release ();
923 /* Free all memory used by the _loop_vec_info, as well as all the
924 stmt_vec_info structs of all the stmts in the loop. */
926 _loop_vec_info::~_loop_vec_info ()
928 free (bbs);
930 release_vec_loop_controls (&masks);
931 release_vec_loop_controls (&lens);
932 delete ivexpr_map;
933 delete scan_map;
934 epilogue_vinfos.release ();
936 /* When we release an epilogue vinfo that we do not intend to use
937 avoid clearing AUX of the main loop which should continue to
938 point to the main loop vinfo since otherwise we'll leak that. */
939 if (loop->aux == this)
940 loop->aux = NULL;
943 /* Return an invariant or register for EXPR and emit necessary
944 computations in the LOOP_VINFO loop preheader. */
946 tree
947 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
949 if (is_gimple_reg (expr)
950 || is_gimple_min_invariant (expr))
951 return expr;
953 if (! loop_vinfo->ivexpr_map)
954 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
955 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
956 if (! cached)
958 gimple_seq stmts = NULL;
959 cached = force_gimple_operand (unshare_expr (expr),
960 &stmts, true, NULL_TREE);
961 if (stmts)
963 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
964 gsi_insert_seq_on_edge_immediate (e, stmts);
967 return cached;
970 /* Return true if we can use CMP_TYPE as the comparison type to produce
971 all masks required to mask LOOP_VINFO. */
973 static bool
974 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
976 rgroup_controls *rgm;
977 unsigned int i;
978 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
979 if (rgm->type != NULL_TREE
980 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
981 cmp_type, rgm->type,
982 OPTIMIZE_FOR_SPEED))
983 return false;
984 return true;
987 /* Calculate the maximum number of scalars per iteration for every
988 rgroup in LOOP_VINFO. */
990 static unsigned int
991 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
993 unsigned int res = 1;
994 unsigned int i;
995 rgroup_controls *rgm;
996 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
997 res = MAX (res, rgm->max_nscalars_per_iter);
998 return res;
1001 /* Calculate the minimum precision necessary to represent:
1003 MAX_NITERS * FACTOR
1005 as an unsigned integer, where MAX_NITERS is the maximum number of
1006 loop header iterations for the original scalar form of LOOP_VINFO. */
1008 static unsigned
1009 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1011 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1013 /* Get the maximum number of iterations that is representable
1014 in the counter type. */
1015 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1016 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1018 /* Get a more refined estimate for the number of iterations. */
1019 widest_int max_back_edges;
1020 if (max_loop_iterations (loop, &max_back_edges))
1021 max_ni = wi::smin (max_ni, max_back_edges + 1);
1023 /* Work out how many bits we need to represent the limit. */
1024 return wi::min_precision (max_ni * factor, UNSIGNED);
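/* Illustrative arithmetic: if the scalar loop runs at most 1000 header
   iterations and FACTOR is 4, the product 4000 needs 12 bits.  The
   helper below is a plain-C sketch of the same bit count (the real
   code uses wide-int min_precision); it is not part of this file.  */

static inline unsigned
example_min_unsigned_precision (unsigned long long value)
{
  /* Number of bits needed to represent VALUE as an unsigned integer.  */
  unsigned prec = 0;
  while (value)
    {
      prec++;
      value >>= 1;
    }
  return prec;
}

/* e.g. example_min_unsigned_precision (1000ULL * 4) == 12.  */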
1027 /* True if the loop needs peeling or partial vectors when vectorized. */
1029 static bool
1030 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1032 unsigned HOST_WIDE_INT const_vf;
1033 HOST_WIDE_INT max_niter
1034 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1036 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1037 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1038 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1039 (loop_vinfo));
1041 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1042 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1044 /* Work out the (constant) number of iterations that need to be
1045 peeled for reasons other than niters. */
1046 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1047 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1048 peel_niter += 1;
1049 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1050 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1051 return true;
1053 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1054 /* ??? When peeling for gaps but not alignment, we could
1055 try to check whether the (variable) niters is known to be
1056 VF * N + 1. That's something of a niche case though. */
1057 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1058 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1059 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1060 < (unsigned) exact_log2 (const_vf))
1061 /* In case of versioning, check if the maximum number of
1062 iterations is greater than th. If they are identical,
1063 the epilogue is unnecessary. */
1064 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1065 || ((unsigned HOST_WIDE_INT) max_niter
1066 > (th / const_vf) * const_vf))))
1067 return true;
1069 return false;
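/* Illustrative arithmetic for the constant-niters case above: with
   NITERS known to be 100, one iteration peeled for gaps, and VF = 8,
   99 is not a multiple of 8, so an epilogue (or partial vectors) is
   needed; with NITERS = 97 the remaining 96 iterations divide evenly
   and no epilogue is required.  The helper is a sketch only.  */

static inline int
example_needs_epilogue_p (unsigned long long niters, unsigned peel_niter,
                          unsigned long long vf)
{
  /* Whatever is left after peeling must be a whole number of vectors.  */
  return (niters - peel_niter) % vf != 0;
}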
1072 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1073 whether we can actually generate the masks required. Return true if so,
1074 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1076 static bool
1077 vect_verify_full_masking (loop_vec_info loop_vinfo)
1079 unsigned int min_ni_width;
1080 unsigned int max_nscalars_per_iter
1081 = vect_get_max_nscalars_per_iter (loop_vinfo);
1083 /* Use a normal loop if there are no statements that need masking.
1084 This only happens in rare degenerate cases: it means that the loop
1085 has no loads, no stores, and no live-out values. */
1086 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1087 return false;
1089 /* Work out how many bits we need to represent the limit. */
1090 min_ni_width
1091 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1093 /* Find a scalar mode for which WHILE_ULT is supported. */
1094 opt_scalar_int_mode cmp_mode_iter;
1095 tree cmp_type = NULL_TREE;
1096 tree iv_type = NULL_TREE;
1097 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1098 unsigned int iv_precision = UINT_MAX;
1100 if (iv_limit != -1)
1101 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1102 UNSIGNED);
1104 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1106 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1107 if (cmp_bits >= min_ni_width
1108 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1110 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1111 if (this_type
1112 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1114 /* Although we could stop as soon as we find a valid mode,
1115 there are at least two reasons why that's not always the
1116 best choice:
1118 - An IV that's Pmode or wider is more likely to be reusable
1119 in address calculations than an IV that's narrower than
1120 Pmode.
1122 - Doing the comparison in IV_PRECISION or wider allows
1123 a natural 0-based IV, whereas using a narrower comparison
1124 type requires mitigations against wrap-around.
1126 Conversely, if the IV limit is variable, doing the comparison
1127 in a wider type than the original type can introduce
1128 unnecessary extensions, so picking the widest valid mode
1129 is not always a good choice either.
1131 Here we prefer the first IV type that's Pmode or wider,
1132 and the first comparison type that's IV_PRECISION or wider.
1133 (The comparison type must be no wider than the IV type,
1134 to avoid extensions in the vector loop.)
1136 ??? We might want to try continuing beyond Pmode for ILP32
1137 targets if CMP_BITS < IV_PRECISION. */
1138 iv_type = this_type;
1139 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1140 cmp_type = this_type;
1141 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1142 break;
1147 if (!cmp_type)
1148 return false;
1150 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1151 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1152 return true;
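/* Sketch of the predicate an IFN_WHILE_ULT comparison computes for one
   mask: lane I is active iff INDEX + I is still below the scalar
   iteration limit.  Illustrative only; the vectorizer emits this as a
   single vector operation rather than a scalar loop.  */

static void
example_while_ult (unsigned long long index, unsigned long long limit,
                   unsigned char *mask, unsigned nlanes)
{
  for (unsigned i = 0; i < nlanes; i++)
    mask[i] = (index + i < limit);
}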
1155 /* Check whether we can use vector access with length based on precision
1156 comparison. So far, to keep it simple, we only allow the case that the
1157 precision of the target supported length is larger than the precision
1158 required by loop niters. */
1160 static bool
1161 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1163 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1164 return false;
1166 unsigned int max_nitems_per_iter = 1;
1167 unsigned int i;
1168 rgroup_controls *rgl;
1169 /* Find the maximum number of items per iteration for every rgroup. */
1170 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1172 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1173 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1176 /* Work out how many bits we need to represent the length limit. */
1177 unsigned int min_ni_prec
1178 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1180 /* Now use the maximum of the precisions below for one suitable IV type:
1181 - the IV's natural precision
1182 - the precision needed to hold: the maximum number of scalar
1183 iterations multiplied by the scale factor (min_ni_prec above)
1184 - the Pmode precision
1186 If min_ni_prec is less than the precision of the current niters,
1187 we prefer to still use the niters type.  Prefer Pmode or a wider IV
1188 to avoid narrow conversions. */
1190 unsigned int ni_prec
1191 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1192 min_ni_prec = MAX (min_ni_prec, ni_prec);
1193 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1195 tree iv_type = NULL_TREE;
1196 opt_scalar_int_mode tmode_iter;
1197 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1199 scalar_mode tmode = tmode_iter.require ();
1200 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1202 /* ??? Do we really want to construct one IV whose precision exceeds
1203 BITS_PER_WORD? */
1204 if (tbits > BITS_PER_WORD)
1205 break;
1207 /* Find the first available standard integral type. */
1208 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1210 iv_type = build_nonstandard_integer_type (tbits, true);
1211 break;
1215 if (!iv_type)
1217 if (dump_enabled_p ())
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219 "can't vectorize with length-based partial vectors"
1220 " because there is no suitable iv type.\n");
1221 return false;
1224 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1225 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1227 return true;
1230 /* Calculate the cost of one scalar iteration of the loop. */
1231 static void
1232 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1234 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1235 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1236 int nbbs = loop->num_nodes, factor;
1237 int innerloop_iters, i;
1239 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1241 /* Gather costs for statements in the scalar loop. */
1243 /* FORNOW. */
1244 innerloop_iters = 1;
1245 if (loop->inner)
1246 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1248 for (i = 0; i < nbbs; i++)
1250 gimple_stmt_iterator si;
1251 basic_block bb = bbs[i];
1253 if (bb->loop_father == loop->inner)
1254 factor = innerloop_iters;
1255 else
1256 factor = 1;
1258 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1260 gimple *stmt = gsi_stmt (si);
1261 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1263 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1264 continue;
1266 /* Skip stmts that are not vectorized inside the loop. */
1267 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1268 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1269 && (!STMT_VINFO_LIVE_P (vstmt_info)
1270 || !VECTORIZABLE_CYCLE_DEF
1271 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1272 continue;
1274 vect_cost_for_stmt kind;
1275 if (STMT_VINFO_DATA_REF (stmt_info))
1277 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1278 kind = scalar_load;
1279 else
1280 kind = scalar_store;
1282 else if (vect_nop_conversion_p (stmt_info))
1283 continue;
1284 else
1285 kind = scalar_stmt;
1287 /* We are using vect_prologue here to avoid scaling twice
1288 by the inner loop factor. */
1289 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1290 factor, kind, stmt_info, 0, vect_prologue);
1294 /* Now accumulate cost. */
1295 void *target_cost_data = init_cost (loop, true);
1296 stmt_info_for_cost *si;
1297 int j;
1298 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1299 j, si)
1300 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1301 si->kind, si->stmt_info, si->vectype,
1302 si->misalign, si->where);
1303 unsigned prologue_cost = 0, body_cost = 0, epilogue_cost = 0;
1304 finish_cost (target_cost_data, &prologue_cost, &body_cost,
1305 &epilogue_cost);
1306 destroy_cost_data (target_cost_data);
1307 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1308 = prologue_cost + body_cost + epilogue_cost;
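/* Illustrative arithmetic for the scaling above, under the assumption
   that every statement has unit cost: with an inner-loop cost factor
   of 50, three outer-loop statements and two inner-loop statements
   give 3 * 1 + 2 * 50 = 103 as the single scalar iteration cost.  The
   helper is a sketch, not the cost model itself.  */

static inline unsigned
example_scalar_iteration_cost (unsigned outer_stmts, unsigned inner_stmts,
                               unsigned inner_loop_cost_factor)
{
  return outer_stmts + inner_stmts * inner_loop_cost_factor;
}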
1312 /* Function vect_analyze_loop_form_1.
1314 Verify that certain CFG restrictions hold, including:
1315 - the loop has a pre-header
1316 - the loop has a single entry and exit
1317 - the loop exit condition is simple enough
1318 - the number of iterations can be analyzed, i.e., a countable loop. The
1319 niter could be analyzed under some assumptions. */
1321 opt_result
1322 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1323 tree *assumptions, tree *number_of_iterationsm1,
1324 tree *number_of_iterations, gcond **inner_loop_cond)
1326 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1328 /* Different restrictions apply when we are considering an inner-most loop,
1329 vs. an outer (nested) loop.
1330 (FORNOW. May want to relax some of these restrictions in the future). */
1332 if (!loop->inner)
1334 /* Inner-most loop. We currently require that the number of BBs is
1335 exactly 2 (the header and latch). Vectorizable inner-most loops
1336 look like this:
1338 (pre-header)
1340 header <--------+
1341 | | |
1342 | +--> latch --+
1344 (exit-bb) */
1346 if (loop->num_nodes != 2)
1347 return opt_result::failure_at (vect_location,
1348 "not vectorized:"
1349 " control flow in loop.\n");
1351 if (empty_block_p (loop->header))
1352 return opt_result::failure_at (vect_location,
1353 "not vectorized: empty loop.\n");
1355 else
1357 class loop *innerloop = loop->inner;
1358 edge entryedge;
1360 /* Nested loop. We currently require that the loop is doubly-nested,
1361 contains a single inner loop, and the number of BBs is exactly 5.
1362 Vectorizable outer-loops look like this:
1364 (pre-header)
1366 header <---+
1368 inner-loop |
1370 tail ------+
1372 (exit-bb)
1374 The inner-loop has the properties expected of inner-most loops
1375 as described above. */
1377 if ((loop->inner)->inner || (loop->inner)->next)
1378 return opt_result::failure_at (vect_location,
1379 "not vectorized:"
1380 " multiple nested loops.\n");
1382 if (loop->num_nodes != 5)
1383 return opt_result::failure_at (vect_location,
1384 "not vectorized:"
1385 " control flow in loop.\n");
1387 entryedge = loop_preheader_edge (innerloop);
1388 if (entryedge->src != loop->header
1389 || !single_exit (innerloop)
1390 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1391 return opt_result::failure_at (vect_location,
1392 "not vectorized:"
1393 " unsupported outerloop form.\n");
1395 /* Analyze the inner-loop. */
1396 tree inner_niterm1, inner_niter, inner_assumptions;
1397 opt_result res
1398 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1399 &inner_assumptions, &inner_niterm1,
1400 &inner_niter, NULL);
1401 if (!res)
1403 if (dump_enabled_p ())
1404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1405 "not vectorized: Bad inner loop.\n");
1406 return res;
1409 /* Don't support analyzing niter under assumptions for inner
1410 loop. */
1411 if (!integer_onep (inner_assumptions))
1412 return opt_result::failure_at (vect_location,
1413 "not vectorized: Bad inner loop.\n");
1415 if (!expr_invariant_in_loop_p (loop, inner_niter))
1416 return opt_result::failure_at (vect_location,
1417 "not vectorized: inner-loop count not"
1418 " invariant.\n");
1420 if (dump_enabled_p ())
1421 dump_printf_loc (MSG_NOTE, vect_location,
1422 "Considering outer-loop vectorization.\n");
1425 if (!single_exit (loop))
1426 return opt_result::failure_at (vect_location,
1427 "not vectorized: multiple exits.\n");
1428 if (EDGE_COUNT (loop->header->preds) != 2)
1429 return opt_result::failure_at (vect_location,
1430 "not vectorized:"
1431 " too many incoming edges.\n");
1433 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1434 that the loop is represented as a do-while (with a proper if-guard
1435 before the loop if needed), where the loop header contains all the
1436 executable statements, and the latch is empty. */
1437 if (!empty_block_p (loop->latch)
1438 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1439 return opt_result::failure_at (vect_location,
1440 "not vectorized: latch block not empty.\n");
1442 /* Make sure the exit is not abnormal. */
1443 edge e = single_exit (loop);
1444 if (e->flags & EDGE_ABNORMAL)
1445 return opt_result::failure_at (vect_location,
1446 "not vectorized:"
1447 " abnormal loop exit edge.\n");
1449 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1450 number_of_iterationsm1);
1451 if (!*loop_cond)
1452 return opt_result::failure_at
1453 (vect_location,
1454 "not vectorized: complicated exit condition.\n");
1456 if (integer_zerop (*assumptions)
1457 || !*number_of_iterations
1458 || chrec_contains_undetermined (*number_of_iterations))
1459 return opt_result::failure_at
1460 (*loop_cond,
1461 "not vectorized: number of iterations cannot be computed.\n");
1463 if (integer_zerop (*number_of_iterations))
1464 return opt_result::failure_at
1465 (*loop_cond,
1466 "not vectorized: number of iterations = 0.\n");
1468 return opt_result::success ();
1471 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1473 opt_loop_vec_info
1474 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1476 tree assumptions, number_of_iterations, number_of_iterationsm1;
1477 gcond *loop_cond, *inner_loop_cond = NULL;
1479 opt_result res
1480 = vect_analyze_loop_form_1 (loop, &loop_cond,
1481 &assumptions, &number_of_iterationsm1,
1482 &number_of_iterations, &inner_loop_cond);
1483 if (!res)
1484 return opt_loop_vec_info::propagate_failure (res);
1486 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1487 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1488 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1489 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1490 if (!integer_onep (assumptions))
1492 /* We consider to vectorize this loop by versioning it under
1493 some assumptions. In order to do this, we need to clear
1494 existing information computed by scev and niter analyzer. */
1495 scev_reset_htab ();
1496 free_numbers_of_iterations_estimates (loop);
1497 /* Also set flag for this loop so that following scev and niter
1498 analysis are done under the assumptions. */
1499 loop_constraint_set (loop, LOOP_C_FINITE);
1500 /* Also record the assumptions for versioning. */
1501 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1504 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1506 if (dump_enabled_p ())
1508 dump_printf_loc (MSG_NOTE, vect_location,
1509 "Symbolic number of iterations is ");
1510 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1511 dump_printf (MSG_NOTE, "\n");
1515 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1516 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1517 if (inner_loop_cond)
1519 stmt_vec_info inner_loop_cond_info
1520 = loop_vinfo->lookup_stmt (inner_loop_cond);
1521 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1522 /* If we have an estimate on the number of iterations of the inner
1523 loop use that to limit the scale for costing, otherwise use
1524 --param vect-inner-loop-cost-factor literally. */
1525 widest_int nit;
1526 if (estimated_stmt_executions (loop->inner, &nit))
1527 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1528 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1531 gcc_assert (!loop->aux);
1532 loop->aux = loop_vinfo;
1533 return opt_loop_vec_info::success (loop_vinfo);
1538 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1539 statements update the vectorization factor. */
1541 static void
1542 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1544 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1545 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1546 int nbbs = loop->num_nodes;
1547 poly_uint64 vectorization_factor;
1548 int i;
1550 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1552 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1553 gcc_assert (known_ne (vectorization_factor, 0U));
1555 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1556 the vectorization factor of the loop is the unrolling factor required by
1557 the SLP instances.  If that unrolling factor is 1, we say that we
1558 perform pure SLP on the loop - cross-iteration parallelism is not
1559 exploited. */
1560 bool only_slp_in_loop = true;
1561 for (i = 0; i < nbbs; i++)
1563 basic_block bb = bbs[i];
1564 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1565 gsi_next (&si))
1567 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1568 if (!stmt_info)
1569 continue;
1570 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1571 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1572 && !PURE_SLP_STMT (stmt_info))
1573 /* STMT needs both SLP and loop-based vectorization. */
1574 only_slp_in_loop = false;
1576 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1577 gsi_next (&si))
1579 if (is_gimple_debug (gsi_stmt (si)))
1580 continue;
1581 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1582 stmt_info = vect_stmt_to_vectorize (stmt_info);
1583 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1584 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1585 && !PURE_SLP_STMT (stmt_info))
1586 /* STMT needs both SLP and loop-based vectorization. */
1587 only_slp_in_loop = false;
1591 if (only_slp_in_loop)
1593 if (dump_enabled_p ())
1594 dump_printf_loc (MSG_NOTE, vect_location,
1595 "Loop contains only SLP stmts\n");
1596 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1598 else
1600 if (dump_enabled_p ())
1601 dump_printf_loc (MSG_NOTE, vect_location,
1602 "Loop contains SLP and non-SLP stmts\n");
1603 /* Both the vectorization factor and unroll factor have the form
1604 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1605 so they must have a common multiple. */
1606 vectorization_factor
1607 = force_common_multiple (vectorization_factor,
1608 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1611 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1612 if (dump_enabled_p ())
1614 dump_printf_loc (MSG_NOTE, vect_location,
1615 "Updating vectorization factor to ");
1616 dump_dec (MSG_NOTE, vectorization_factor);
1617 dump_printf (MSG_NOTE, ".\n");
1621 /* Return true if STMT_INFO describes a double reduction phi and if
1622 the other phi in the reduction is also relevant for vectorization.
1623 This rejects cases such as:
1625 outer1:
1626 x_1 = PHI <x_3(outer2), ...>;
1629 inner:
1630 x_2 = ...;
1633 outer2:
1634 x_3 = PHI <x_2(inner)>;
1636 if nothing in x_2 or elsewhere makes x_1 relevant. */
1638 static bool
1639 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1641 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1642 return false;
1644 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1647 /* Function vect_analyze_loop_operations.
1649 Scan the loop stmts and make sure they are all vectorizable. */
1651 static opt_result
1652 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1654 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1655 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1656 int nbbs = loop->num_nodes;
1657 int i;
1658 stmt_vec_info stmt_info;
1659 bool need_to_vectorize = false;
1660 bool ok;
1662 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1664 auto_vec<stmt_info_for_cost> cost_vec;
1666 for (i = 0; i < nbbs; i++)
1668 basic_block bb = bbs[i];
1670 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1671 gsi_next (&si))
1673 gphi *phi = si.phi ();
1674 ok = true;
1676 stmt_info = loop_vinfo->lookup_stmt (phi);
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1679 if (virtual_operand_p (gimple_phi_result (phi)))
1680 continue;
1682 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1683 (i.e., a phi in the tail of the outer-loop). */
1684 if (! is_loop_header_bb_p (bb))
1686 /* FORNOW: we currently don't support the case that these phis
1687 are not used in the outer loop (unless it is a double reduction,
1688 i.e., this phi is vect_reduction_def), because this case
1689 would require us to actually do something here. */
1690 if (STMT_VINFO_LIVE_P (stmt_info)
1691 && !vect_active_double_reduction_p (stmt_info))
1692 return opt_result::failure_at (phi,
1693 "Unsupported loop-closed phi"
1694 " in outer-loop.\n");
1696 /* If PHI is used in the outer loop, we check that its operand
1697 is defined in the inner loop. */
1698 if (STMT_VINFO_RELEVANT_P (stmt_info))
1700 tree phi_op;
1702 if (gimple_phi_num_args (phi) != 1)
1703 return opt_result::failure_at (phi, "unsupported phi");
1705 phi_op = PHI_ARG_DEF (phi, 0);
1706 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1707 if (!op_def_info)
1708 return opt_result::failure_at (phi, "unsupported phi\n");
1710 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1711 && (STMT_VINFO_RELEVANT (op_def_info)
1712 != vect_used_in_outer_by_reduction))
1713 return opt_result::failure_at (phi, "unsupported phi\n");
1715 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1716 || (STMT_VINFO_DEF_TYPE (stmt_info)
1717 == vect_double_reduction_def))
1718 && !vectorizable_lc_phi (loop_vinfo,
1719 stmt_info, NULL, NULL))
1720 return opt_result::failure_at (phi, "unsupported phi\n");
1723 continue;
1726 gcc_assert (stmt_info);
1728 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1729 || STMT_VINFO_LIVE_P (stmt_info))
1730 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1731 /* A scalar-dependence cycle that we don't support. */
1732 return opt_result::failure_at (phi,
1733 "not vectorized:"
1734 " scalar dependence cycle.\n");
1736 if (STMT_VINFO_RELEVANT_P (stmt_info))
1738 need_to_vectorize = true;
1739 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1740 && ! PURE_SLP_STMT (stmt_info))
1741 ok = vectorizable_induction (loop_vinfo,
1742 stmt_info, NULL, NULL,
1743 &cost_vec);
1744 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1745 || (STMT_VINFO_DEF_TYPE (stmt_info)
1746 == vect_double_reduction_def)
1747 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1748 && ! PURE_SLP_STMT (stmt_info))
1749 ok = vectorizable_reduction (loop_vinfo,
1750 stmt_info, NULL, NULL, &cost_vec);
1753 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1754 if (ok
1755 && STMT_VINFO_LIVE_P (stmt_info)
1756 && !PURE_SLP_STMT (stmt_info))
1757 ok = vectorizable_live_operation (loop_vinfo,
1758 stmt_info, NULL, NULL, NULL,
1759 -1, false, &cost_vec);
1761 if (!ok)
1762 return opt_result::failure_at (phi,
1763 "not vectorized: relevant phi not "
1764 "supported: %G",
1765 static_cast <gimple *> (phi));
1768 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1769 gsi_next (&si))
1771 gimple *stmt = gsi_stmt (si);
1772 if (!gimple_clobber_p (stmt)
1773 && !is_gimple_debug (stmt))
1775 opt_result res
1776 = vect_analyze_stmt (loop_vinfo,
1777 loop_vinfo->lookup_stmt (stmt),
1778 &need_to_vectorize,
1779 NULL, NULL, &cost_vec);
1780 if (!res)
1781 return res;
1784 } /* bbs */
1786 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1788 /* All operations in the loop are either irrelevant (deal with loop
1789 control, or dead), or only used outside the loop and can be moved
1790 out of the loop (e.g. invariants, inductions). The loop can be
1791 optimized away by scalar optimizations. We're better off not
1792 touching this loop. */
1793 if (!need_to_vectorize)
1795 if (dump_enabled_p ())
1796 dump_printf_loc (MSG_NOTE, vect_location,
1797 "All the computation can be taken out of the loop.\n");
1798 return opt_result::failure_at
1799 (vect_location,
1800 "not vectorized: redundant loop. no profit to vectorize.\n");
1803 return opt_result::success ();
1806 /* Return true if we know that the iteration count is smaller than the
1807 vectorization factor. Return false if it isn't, or if we can't be sure
1808 either way. */
1810 static bool
1811 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1813 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1815 HOST_WIDE_INT max_niter;
1816 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1817 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1818 else
1819 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1821 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1822 return true;
1824 return false;
1827 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1828 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1829 definitely no, or -1 if it's worth retrying. */
1831 static int
1832 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1834 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1835 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1837 /* Only loops that can handle partially-populated vectors can have iteration
1838 counts less than the vectorization factor. */
1839 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1841 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1843 if (dump_enabled_p ())
1844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1845 "not vectorized: iteration count smaller than "
1846 "vectorization factor.\n");
1847 return 0;
1851 /* If using the "very cheap" model, reject cases in which we'd keep
1852 a copy of the scalar code (even if we might be able to vectorize it). */
1853 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1854 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1855 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1856 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1858 if (dump_enabled_p ())
1859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1860 "some scalar iterations would need to be peeled\n");
1861 return 0;
1864 int min_profitable_iters, min_profitable_estimate;
1865 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1866 &min_profitable_estimate);
1868 if (min_profitable_iters < 0)
1870 if (dump_enabled_p ())
1871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1872 "not vectorized: vectorization not profitable.\n");
1873 if (dump_enabled_p ())
1874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1875 "not vectorized: vector version will never be "
1876 "profitable.\n");
1877 return -1;
1880 int min_scalar_loop_bound = (param_min_vect_loop_bound
1881 * assumed_vf);
1883 /* Use the cost model only if it is more conservative than user specified
1884 threshold. */
1885 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1886 min_profitable_iters);
1888 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1890 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1891 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1895 "not vectorized: vectorization not profitable.\n");
1896 if (dump_enabled_p ())
1897 dump_printf_loc (MSG_NOTE, vect_location,
1898 "not vectorized: iteration count smaller than user "
1899 "specified loop bound parameter or minimum profitable "
1900 "iterations (whichever is more conservative).\n");
1901 return 0;
1904 /* The static profitability threshold min_profitable_estimate includes
1905 the cost of having to check at runtime whether the scalar loop
1906 should be used instead. If it turns out that we don't need or want
1907 such a check, the threshold we should use for the static estimate
1908 is simply the point at which the vector loop becomes more profitable
1909 than the scalar loop. */
1910 if (min_profitable_estimate > min_profitable_iters
1911 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1912 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1913 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1914 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1916 if (dump_enabled_p ())
1917 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1918 " choice between the scalar and vector loops\n");
1919 min_profitable_estimate = min_profitable_iters;
1922 /* If the vector loop needs multiple iterations to be beneficial then
1923 things are probably too close to call, and the conservative thing
1924 would be to stick with the scalar code. */
1925 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1926 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1928 if (dump_enabled_p ())
1929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1930 "one iteration of the vector loop would be"
1931 " more expensive than the equivalent number of"
1932 " iterations of the scalar loop\n");
1933 return 0;
1936 HOST_WIDE_INT estimated_niter;
1938 /* If we are vectorizing an epilogue then we know the maximum number of
1939 scalar iterations it will cover is at least one lower than the
1940 vectorization factor of the main loop. */
1941 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1942 estimated_niter
1943 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1944 else
1946 estimated_niter = estimated_stmt_executions_int (loop);
1947 if (estimated_niter == -1)
1948 estimated_niter = likely_max_stmt_executions_int (loop);
1950 if (estimated_niter != -1
1951 && ((unsigned HOST_WIDE_INT) estimated_niter
1952 < MAX (th, (unsigned) min_profitable_estimate)))
1954 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1956 "not vectorized: estimated iteration count too "
1957 "small.\n");
1958 if (dump_enabled_p ())
1959 dump_printf_loc (MSG_NOTE, vect_location,
1960 "not vectorized: estimated iteration count smaller "
1961 "than specified loop bound parameter or minimum "
1962 "profitable iterations (whichever is more "
1963 "conservative).\n");
1964 return -1;
1967 return 1;
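/* Illustrative sketch (not part of the vectorizer): the threshold
   computed above is the more conservative of the user-tunable
   param_min_vect_loop_bound scaled by the assumed VF and the cost
   model's min_profitable_iters.  The standalone helper below mirrors
   that arithmetic with plain integers; the parameter names are
   hypothetical stand-ins for the GCC values.  */

static inline int
example_cost_model_threshold (int min_vect_loop_bound, int assumed_vf,
                              int min_profitable_iters)
{
  int min_scalar_loop_bound = min_vect_loop_bound * assumed_vf;
  return (min_scalar_loop_bound > min_profitable_iters
          ? min_scalar_loop_bound : min_profitable_iters);
}

/* E.g. with min_vect_loop_bound == 0, assumed_vf == 4 and
   min_profitable_iters == 12 the threshold is 12, so a loop known to
   run fewer than 12 iterations is rejected as not profitable.  */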
1970 static opt_result
1971 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1972 vec<data_reference_p> *datarefs,
1973 unsigned int *n_stmts)
1975 *n_stmts = 0;
1976 for (unsigned i = 0; i < loop->num_nodes; i++)
1977 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1978 !gsi_end_p (gsi); gsi_next (&gsi))
1980 gimple *stmt = gsi_stmt (gsi);
1981 if (is_gimple_debug (stmt))
1982 continue;
1983 ++(*n_stmts);
1984 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1985 NULL, 0);
1986 if (!res)
1988 if (is_gimple_call (stmt) && loop->safelen)
1990 tree fndecl = gimple_call_fndecl (stmt), op;
1991 if (fndecl != NULL_TREE)
1993 cgraph_node *node = cgraph_node::get (fndecl);
1994 if (node != NULL && node->simd_clones != NULL)
1996 unsigned int j, n = gimple_call_num_args (stmt);
1997 for (j = 0; j < n; j++)
1999 op = gimple_call_arg (stmt, j);
2000 if (DECL_P (op)
2001 || (REFERENCE_CLASS_P (op)
2002 && get_base_address (op)))
2003 break;
2005 op = gimple_call_lhs (stmt);
2006 /* Ignore #pragma omp declare simd functions
2007 if they don't have data references in the
2008 call stmt itself. */
2009 if (j == n
2010 && !(op
2011 && (DECL_P (op)
2012 || (REFERENCE_CLASS_P (op)
2013 && get_base_address (op)))))
2014 continue;
2018 return res;
2020 /* If dependence analysis will give up due to the limit on the
2021 number of datarefs, stop here and fail fatally. */
2022 if (datarefs->length ()
2023 > (unsigned)param_loop_max_datarefs_for_datadeps)
2024 return opt_result::failure_at (stmt, "exceeded param "
2025 "loop-max-datarefs-for-datadeps\n");
2027 return opt_result::success ();
2030 /* Look for SLP-only access groups and turn each individual access into its own
2031 group. */
2032 static void
2033 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2035 unsigned int i;
2036 struct data_reference *dr;
2038 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2040 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2041 FOR_EACH_VEC_ELT (datarefs, i, dr)
2043 gcc_assert (DR_REF (dr));
2044 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2046 /* Check if the access is part of an interleaving chain. */
2047 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2049 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2050 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2051 unsigned int group_size = DR_GROUP_SIZE (first_element);
2053 /* Check if this is an SLP-only group. */
2054 if (!STMT_SLP_TYPE (stmt_info)
2055 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2057 /* Dissolve the group. */
2058 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2060 stmt_vec_info vinfo = first_element;
2061 while (vinfo)
2063 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2064 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2065 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2066 DR_GROUP_SIZE (vinfo) = 1;
2067 if (STMT_VINFO_STRIDED_P (first_element))
2068 DR_GROUP_GAP (vinfo) = 0;
2069 else
2070 DR_GROUP_GAP (vinfo) = group_size - 1;
2071 /* Duplicate and adjust the alignment info; it needs to
2072 be present on each group leader, see dr_misalignment. */
2073 if (vinfo != first_element)
2075 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2076 dr_info2->target_alignment = dr_info->target_alignment;
2077 int misalignment = dr_info->misalignment;
2078 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2080 HOST_WIDE_INT diff
2081 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2082 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2083 unsigned HOST_WIDE_INT align_c
2084 = dr_info->target_alignment.to_constant ();
2085 misalignment = (misalignment + diff) % align_c;
2087 dr_info2->misalignment = misalignment;
2089 vinfo = next;
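/* Illustrative sketch (not part of the vectorizer): when a group is
   dissolved each element becomes its own group leader and inherits
   alignment info derived from the old leader.  If the old leader is
   misaligned by M bytes with respect to a target alignment of A bytes,
   an element starting DIFF bytes later is misaligned by (M + DIFF) % A,
   which is what the adjustment above computes.  A standalone version
   with hypothetical byte offsets:  */

static inline int
example_adjust_misalignment (int leader_misalignment, int byte_diff,
                             int target_alignment)
{
  return (leader_misalignment + byte_diff) % target_alignment;
}

/* E.g. a leader misaligned by 4 bytes against a 16-byte target
   alignment gives an element 8 bytes further on a misalignment of
   (4 + 8) % 16 == 12.  */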
2096 /* Determine if operating on full vectors for LOOP_VINFO might leave
2097 some scalar iterations still to do. If so, decide how we should
2098 handle those scalar iterations. The possibilities are:
2100 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2101 In this case:
2103 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2104 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2105 LOOP_VINFO_PEELING_FOR_NITER == false
2107 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2108 to handle the remaining scalar iterations. In this case:
2110 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2111 LOOP_VINFO_PEELING_FOR_NITER == true
2113 There are two choices:
2115 (2a) Consider vectorizing the epilogue loop at the same VF as the
2116 main loop, but using partial vectors instead of full vectors.
2117 In this case:
2119 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2121 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2122 In this case:
2124 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2126 When FOR_EPILOGUE_P is true, make this determination based on the
2127 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2128 based on the assumption that LOOP_VINFO is the main loop. The caller
2129 has made sure that the number of iterations is set appropriately for
2130 this value of FOR_EPILOGUE_P. */
2132 opt_result
2133 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2134 bool for_epilogue_p)
2136 /* Determine whether there would be any scalar iterations left over. */
2137 bool need_peeling_or_partial_vectors_p
2138 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2140 /* Decide whether to vectorize the loop with partial vectors. */
2141 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2142 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2143 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2144 && need_peeling_or_partial_vectors_p)
2146 /* For partial-vector-usage=1, try to push the handling of partial
2147 vectors to the epilogue, with the main loop continuing to operate
2148 on full vectors.
2150 ??? We could then end up failing to use partial vectors if we
2151 decide to peel iterations into a prologue, and if the main loop
2152 then ends up processing fewer than VF iterations. */
2153 if (param_vect_partial_vector_usage == 1
2154 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2155 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2156 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2157 else
2158 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2161 if (dump_enabled_p ())
2163 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2164 dump_printf_loc (MSG_NOTE, vect_location,
2165 "operating on partial vectors%s.\n",
2166 for_epilogue_p ? " for epilogue loop" : "");
2167 else
2168 dump_printf_loc (MSG_NOTE, vect_location,
2169 "operating only on full vectors%s.\n",
2170 for_epilogue_p ? " for epilogue loop" : "");
2173 if (for_epilogue_p)
2175 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2176 gcc_assert (orig_loop_vinfo);
2177 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2178 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2179 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2182 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2183 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2185 /* Check that the loop processes at least one full vector. */
2186 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2187 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2188 if (known_lt (wi::to_widest (scalar_niters), vf))
2189 return opt_result::failure_at (vect_location,
2190 "loop does not have enough iterations"
2191 " to support vectorization.\n");
2193 /* If we need to peel an extra epilogue iteration to handle data
2194 accesses with gaps, check that there are enough scalar iterations
2195 available.
2197 The check above is redundant with this one when peeling for gaps,
2198 but the distinction is useful for diagnostics. */
2199 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2200 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2201 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2202 return opt_result::failure_at (vect_location,
2203 "loop does not have enough iterations"
2204 " to support peeling for gaps.\n");
2207 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2208 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2209 && need_peeling_or_partial_vectors_p);
2211 return opt_result::success ();
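/* Illustrative sketch (not part of the vectorizer): ignoring the
   epilogue-loop and known-niters conditions checked above, the choice
   between the cases (1), (2a) and (2b) described before the function
   boils down to a small decision over three booleans.  All names below
   are hypothetical stand-ins for the LOOP_VINFO_* flags.  */

enum example_tail_strategy
{
  EXAMPLE_FULL_VECTORS_ONLY,         /* no leftover scalar iterations */
  EXAMPLE_PARTIAL_VECTORS,           /* case (1) */
  EXAMPLE_EPILOGUE_PARTIAL_VECTORS,  /* case (2a) */
  EXAMPLE_EPILOGUE_LOWER_VF          /* case (2b) */
};

static inline enum example_tail_strategy
example_choose_tail_strategy (int can_use_partial_vectors_p,
                              int need_peeling_or_partial_vectors_p,
                              int push_partial_vectors_to_epilogue_p)
{
  if (!need_peeling_or_partial_vectors_p)
    return EXAMPLE_FULL_VECTORS_ONLY;
  if (!can_use_partial_vectors_p)
    return EXAMPLE_EPILOGUE_LOWER_VF;
  if (push_partial_vectors_to_epilogue_p)
    return EXAMPLE_EPILOGUE_PARTIAL_VECTORS;
  return EXAMPLE_PARTIAL_VECTORS;
}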
2214 /* Function vect_analyze_loop_2.
2216 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2217 for it. The different analyses will record information in the
2218 loop_vec_info struct. */
2219 static opt_result
2220 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2222 opt_result ok = opt_result::success ();
2223 int res;
2224 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2225 poly_uint64 min_vf = 2;
2226 loop_vec_info orig_loop_vinfo = NULL;
2228 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2229 loop_vec_info of the first vectorized loop. */
2230 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2231 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2232 else
2233 orig_loop_vinfo = loop_vinfo;
2234 gcc_assert (orig_loop_vinfo);
2236 /* The first group of checks is independent of the vector size. */
2237 fatal = true;
2239 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2240 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2241 return opt_result::failure_at (vect_location,
2242 "not vectorized: simd if(0)\n");
2244 /* Find all data references in the loop (which correspond to vdefs/vuses)
2245 and analyze their evolution in the loop. */
2247 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2249 /* Gather the data references and count stmts in the loop. */
2250 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2252 opt_result res
2253 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2254 &LOOP_VINFO_DATAREFS (loop_vinfo),
2255 n_stmts);
2256 if (!res)
2258 if (dump_enabled_p ())
2259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2260 "not vectorized: loop contains function "
2261 "calls or data references that cannot "
2262 "be analyzed\n");
2263 return res;
2265 loop_vinfo->shared->save_datarefs ();
2267 else
2268 loop_vinfo->shared->check_datarefs ();
2270 /* Analyze the data references and also adjust the minimal
2271 vectorization factor according to the loads and stores. */
2273 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2274 if (!ok)
2276 if (dump_enabled_p ())
2277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2278 "bad data references.\n");
2279 return ok;
2282 /* Classify all cross-iteration scalar data-flow cycles.
2283 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2284 vect_analyze_scalar_cycles (loop_vinfo);
2286 vect_pattern_recog (loop_vinfo);
2288 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2290 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2291 complex, etc.). FORNOW: Only handle consecutive access patterns. */
2293 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2294 if (!ok)
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298 "bad data access.\n");
2299 return ok;
2302 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2304 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2305 if (!ok)
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 "unexpected pattern.\n");
2310 return ok;
2313 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
2314 fatal = false;
2316 /* Analyze data dependences between the data-refs in the loop
2317 and adjust the maximum vectorization factor according to
2318 the dependences.
2319 FORNOW: fail at the first data dependence that we encounter. */
2321 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2322 if (!ok)
2324 if (dump_enabled_p ())
2325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2326 "bad data dependence.\n");
2327 return ok;
2329 if (max_vf != MAX_VECTORIZATION_FACTOR
2330 && maybe_lt (max_vf, min_vf))
2331 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2332 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2334 ok = vect_determine_vectorization_factor (loop_vinfo);
2335 if (!ok)
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2339 "can't determine vectorization factor.\n");
2340 return ok;
2342 if (max_vf != MAX_VECTORIZATION_FACTOR
2343 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2344 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2346 /* Compute the scalar iteration cost. */
2347 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2349 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2351 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2352 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2353 if (!ok)
2354 return ok;
2356 /* If there are any SLP instances mark them as pure_slp. */
2357 bool slp = vect_make_slp_decision (loop_vinfo);
2358 if (slp)
2360 /* Find stmts that need to be both vectorized and SLPed. */
2361 vect_detect_hybrid_slp (loop_vinfo);
2363 /* Update the vectorization factor based on the SLP decision. */
2364 vect_update_vf_for_slp (loop_vinfo);
2366 /* Optimize the SLP graph with the vectorization factor fixed. */
2367 vect_optimize_slp (loop_vinfo);
2369 /* Gather the loads reachable from the SLP graph entries. */
2370 vect_gather_slp_loads (loop_vinfo);
2373 bool saved_can_use_partial_vectors_p
2374 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2376 /* We don't expect to have to roll back to anything other than an empty
2377 set of rgroups. */
2378 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2380 /* This is the point where we can re-start analysis with SLP forced off. */
2381 start_over:
2383 /* Now the vectorization factor is final. */
2384 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2385 gcc_assert (known_ne (vectorization_factor, 0U));
2387 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2389 dump_printf_loc (MSG_NOTE, vect_location,
2390 "vectorization_factor = ");
2391 dump_dec (MSG_NOTE, vectorization_factor);
2392 dump_printf (MSG_NOTE, ", niters = %wd\n",
2393 LOOP_VINFO_INT_NITERS (loop_vinfo));
2396 /* Analyze the alignment of the data-refs in the loop.
2397 Fail if a data reference is found that cannot be vectorized. */
2399 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2400 if (!ok)
2402 if (dump_enabled_p ())
2403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2404 "bad data alignment.\n");
2405 return ok;
2408 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2409 It is important to call pruning after vect_analyze_data_ref_accesses,
2410 since we use grouping information gathered by interleaving analysis. */
2411 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2412 if (!ok)
2413 return ok;
2415 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2416 vectorization, since we do not want to add extra peeling or
2417 add versioning for alignment. */
2418 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2419 /* This pass will decide on using loop versioning and/or loop peeling in
2420 order to enhance the alignment of data references in the loop. */
2421 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2422 if (!ok)
2423 return ok;
2425 if (slp)
2427 /* Analyze operations in the SLP instances. Note this may
2428 remove unsupported SLP instances, which makes the above
2429 SLP kind detection invalid. */
2430 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2431 vect_slp_analyze_operations (loop_vinfo);
2432 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2434 ok = opt_result::failure_at (vect_location,
2435 "unsupported SLP instances\n");
2436 goto again;
2439 /* Check whether any load in ALL SLP instances is possibly permuted. */
2440 slp_tree load_node, slp_root;
2441 unsigned i, x;
2442 slp_instance instance;
2443 bool can_use_lanes = true;
2444 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2446 slp_root = SLP_INSTANCE_TREE (instance);
2447 int group_size = SLP_TREE_LANES (slp_root);
2448 tree vectype = SLP_TREE_VECTYPE (slp_root);
2449 bool loads_permuted = false;
2450 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2452 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2453 continue;
2454 unsigned j;
2455 stmt_vec_info load_info;
2456 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2457 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2459 loads_permuted = true;
2460 break;
2464 /* If the loads and stores can be handled with load/store-lane
2465 instructions record it and move on to the next instance. */
2466 if (loads_permuted
2467 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2468 && vect_store_lanes_supported (vectype, group_size, false))
2470 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2472 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2473 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2474 /* Use SLP for strided accesses (or if we can't use
2475 load-lanes). */
2476 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2477 || ! vect_load_lanes_supported
2478 (STMT_VINFO_VECTYPE (stmt_vinfo),
2479 DR_GROUP_SIZE (stmt_vinfo), false))
2480 break;
2483 can_use_lanes
2484 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2486 if (can_use_lanes && dump_enabled_p ())
2487 dump_printf_loc (MSG_NOTE, vect_location,
2488 "SLP instance %p can use load/store-lanes\n",
2489 instance);
2491 else
2493 can_use_lanes = false;
2494 break;
2498 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2499 with SLP disabled. */
2500 if (can_use_lanes)
2502 ok = opt_result::failure_at (vect_location,
2503 "Built SLP cancelled: can use "
2504 "load/store-lanes\n");
2505 if (dump_enabled_p ())
2506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2507 "Built SLP cancelled: all SLP instances support "
2508 "load/store-lanes\n");
2509 goto again;
2513 /* Dissolve SLP-only groups. */
2514 vect_dissolve_slp_only_groups (loop_vinfo);
2516 /* Scan all the remaining operations in the loop that are not subject
2517 to SLP and make sure they are vectorizable. */
2518 ok = vect_analyze_loop_operations (loop_vinfo);
2519 if (!ok)
2521 if (dump_enabled_p ())
2522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2523 "bad operation or unsupported loop bound.\n");
2524 return ok;
2527 /* For now, we don't expect to mix both masking and length approaches for one
2528 loop; disable it if both are recorded. */
2529 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2530 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2531 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2535 "can't vectorize a loop with partial vectors"
2536 " because we don't expect to mix different"
2537 " approaches with partial vectors for the"
2538 " same loop.\n");
2539 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2542 /* If we still have the option of using partial vectors,
2543 check whether we can generate the necessary loop controls. */
2544 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2545 && !vect_verify_full_masking (loop_vinfo)
2546 && !vect_verify_loop_lens (loop_vinfo))
2547 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2549 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2550 to be able to handle fewer than VF scalars, or needs to have a lower VF
2551 than the main loop. */
2552 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2553 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2554 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2555 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2556 return opt_result::failure_at (vect_location,
2557 "Vectorization factor too high for"
2558 " epilogue loop.\n");
2560 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2561 assuming that the loop will be used as a main loop. We will redo
2562 this analysis later if we instead decide to use the loop as an
2563 epilogue loop. */
2564 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2565 if (!ok)
2566 return ok;
2568 /* Check the costings of the loop make vectorizing worthwhile. */
2569 res = vect_analyze_loop_costing (loop_vinfo);
2570 if (res < 0)
2572 ok = opt_result::failure_at (vect_location,
2573 "Loop costings may not be worthwhile.\n");
2574 goto again;
2576 if (!res)
2577 return opt_result::failure_at (vect_location,
2578 "Loop costings not worthwhile.\n");
2580 /* If an epilogue loop is required make sure we can create one. */
2581 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2582 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2584 if (dump_enabled_p ())
2585 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2586 if (!vect_can_advance_ivs_p (loop_vinfo)
2587 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2588 single_exit (LOOP_VINFO_LOOP
2589 (loop_vinfo))))
2591 ok = opt_result::failure_at (vect_location,
2592 "not vectorized: can't create required "
2593 "epilog loop\n");
2594 goto again;
2598 /* During peeling, we need to check if the number of loop iterations is
2599 enough for both the peeled prolog loop and the vector loop. This check
2600 can be merged with the threshold check of loop versioning, so
2601 increase the threshold for this case if necessary.
2603 If we are analyzing an epilogue we still want to check what its
2604 versioning threshold would be. If we decide to vectorize the epilogues we
2605 will want to use the lowest versioning threshold of all epilogues and main
2606 loop. This will enable us to enter a vectorized epilogue even when
2607 versioning the loop. We can't simply check whether the epilogue requires
2608 versioning though since we may have skipped some versioning checks when
2609 analyzing the epilogue. For instance, checks for alias versioning will be
2610 skipped when dealing with epilogues as we assume we already checked them
2611 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2612 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2614 poly_uint64 niters_th = 0;
2615 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2617 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2619 /* Niters for peeled prolog loop. */
2620 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2622 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2623 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2624 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2626 else
2627 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2630 /* Niters for at least one iteration of vectorized loop. */
2631 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2632 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2633 /* One additional iteration because of peeling for gaps. */
2634 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2635 niters_th += 1;
2637 /* Use the same condition as vect_transform_loop to decide when to use
2638 the cost to determine a versioning threshold. */
2639 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2640 && ordered_p (th, niters_th))
2641 niters_th = ordered_max (poly_uint64 (th), niters_th);
2643 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
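/* Worked example with hypothetical numbers: with VF = 4, an unknown
   peeling amount for a four-element vector type (contributing
   TYPE_VECTOR_SUBPARTS - 1 = 3 iterations), a full-vector main loop
   (contributing VF = 4) and peeling for gaps (contributing 1), the
   versioning threshold starts at 3 + 4 + 1 = 8 and is then raised to
   the cost-model threshold TH if the runtime profitability check
   applies and TH is larger.  */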
2646 gcc_assert (known_eq (vectorization_factor,
2647 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2649 /* Ok to vectorize! */
2650 return opt_result::success ();
2652 again:
2653 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2654 gcc_assert (!ok);
2656 /* Try again with SLP forced off, but if we didn't do any SLP there is
2657 no point in re-trying. */
2658 if (!slp)
2659 return ok;
2661 /* If there are reduction chains re-trying will fail anyway. */
2662 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2663 return ok;
2665 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2666 via interleaving or lane instructions. */
2667 slp_instance instance;
2668 slp_tree node;
2669 unsigned i, j;
2670 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2672 stmt_vec_info vinfo;
2673 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2674 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2675 continue;
2676 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2677 unsigned int size = DR_GROUP_SIZE (vinfo);
2678 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2679 if (! vect_store_lanes_supported (vectype, size, false)
2680 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2681 && ! vect_grouped_store_supported (vectype, size))
2682 return opt_result::failure_at (vinfo->stmt,
2683 "unsupported grouped store\n");
2684 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2686 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2687 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2688 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2689 size = DR_GROUP_SIZE (vinfo);
2690 vectype = STMT_VINFO_VECTYPE (vinfo);
2691 if (! vect_load_lanes_supported (vectype, size, false)
2692 && ! vect_grouped_load_supported (vectype, single_element_p,
2693 size))
2694 return opt_result::failure_at (vinfo->stmt,
2695 "unsupported grouped load\n");
2699 if (dump_enabled_p ())
2700 dump_printf_loc (MSG_NOTE, vect_location,
2701 "re-trying with SLP disabled\n");
2703 /* Roll back state appropriately. No SLP this time. */
2704 slp = false;
2706 /* Restore the vectorization factor as it was without SLP. */
2706 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2707 /* Free the SLP instances. */
2708 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2709 vect_free_slp_instance (instance);
2710 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2711 /* Reset SLP type to loop_vect on all stmts. */
2712 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2714 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2715 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2716 !gsi_end_p (si); gsi_next (&si))
2718 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2719 STMT_SLP_TYPE (stmt_info) = loop_vect;
2720 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2721 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2723 /* vectorizable_reduction adjusts reduction stmt def-types;
2724 restore them to that of the PHI. */
2725 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2726 = STMT_VINFO_DEF_TYPE (stmt_info);
2727 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2728 (STMT_VINFO_REDUC_DEF (stmt_info)))
2729 = STMT_VINFO_DEF_TYPE (stmt_info);
2732 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2733 !gsi_end_p (si); gsi_next (&si))
2735 if (is_gimple_debug (gsi_stmt (si)))
2736 continue;
2737 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2738 STMT_SLP_TYPE (stmt_info) = loop_vect;
2739 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2741 stmt_vec_info pattern_stmt_info
2742 = STMT_VINFO_RELATED_STMT (stmt_info);
2743 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2744 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2746 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2747 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2748 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2749 !gsi_end_p (pi); gsi_next (&pi))
2750 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2751 = loop_vect;
2755 /* Free optimized alias test DDRS. */
2756 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2757 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2758 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2759 /* Reset target cost data. */
2760 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2761 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2762 = init_cost (LOOP_VINFO_LOOP (loop_vinfo), false);
2763 /* Reset accumulated rgroup information. */
2764 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2765 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2766 /* Reset assorted flags. */
2767 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2768 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2769 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2770 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2771 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2772 = saved_can_use_partial_vectors_p;
2774 goto start_over;
2777 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2778 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2779 OLD_LOOP_VINFO is better unless something specifically indicates
2780 otherwise.
2782 Note that this deliberately isn't a partial order. */
2784 static bool
2785 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2786 loop_vec_info old_loop_vinfo)
2788 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2789 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2791 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2792 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2794 /* Always prefer a VF of loop->simdlen over any other VF. */
2795 if (loop->simdlen)
2797 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2798 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2799 if (new_simdlen_p != old_simdlen_p)
2800 return new_simdlen_p;
2803 /* Limit the VFs to what is likely to be the maximum number of iterations,
2804 to handle cases in which at least one loop_vinfo is fully-masked. */
2805 HOST_WIDE_INT estimated_max_niter;
2806 loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo);
2807 unsigned HOST_WIDE_INT main_vf;
2808 if (main_loop
2809 && LOOP_VINFO_NITERS_KNOWN_P (main_loop)
2810 && LOOP_VINFO_VECT_FACTOR (main_loop).is_constant (&main_vf))
2811 estimated_max_niter = LOOP_VINFO_INT_NITERS (main_loop) % main_vf;
2812 else
2813 estimated_max_niter = likely_max_stmt_executions_int (loop);
2814 if (estimated_max_niter != -1)
2816 if (known_le (estimated_max_niter, new_vf))
2817 new_vf = estimated_max_niter;
2818 if (known_le (estimated_max_niter, old_vf))
2819 old_vf = estimated_max_niter;
2822 /* Check whether the (fractional) cost per scalar iteration is lower
2823 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2824 poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
2825 poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
2827 HOST_WIDE_INT est_rel_new_min
2828 = estimated_poly_value (rel_new, POLY_VALUE_MIN);
2829 HOST_WIDE_INT est_rel_new_max
2830 = estimated_poly_value (rel_new, POLY_VALUE_MAX);
2832 HOST_WIDE_INT est_rel_old_min
2833 = estimated_poly_value (rel_old, POLY_VALUE_MIN);
2834 HOST_WIDE_INT est_rel_old_max
2835 = estimated_poly_value (rel_old, POLY_VALUE_MAX);
2837 /* Check first if we can make out an unambiguous total order from the minimum
2838 and maximum estimates. */
2839 if (est_rel_new_min < est_rel_old_min
2840 && est_rel_new_max < est_rel_old_max)
2841 return true;
2842 else if (est_rel_old_min < est_rel_new_min
2843 && est_rel_old_max < est_rel_new_max)
2844 return false;
2845 /* When old_loop_vinfo uses a variable vectorization factor,
2846 we know that it has a lower cost for at least one runtime VF.
2847 However, we don't know how likely that VF is.
2849 One option would be to compare the costs for the estimated VFs.
2850 The problem is that that can put too much pressure on the cost
2851 model. E.g. if the estimated VF is also the lowest possible VF,
2852 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2853 for the estimated VF, we'd then choose new_loop_vinfo even
2854 though (a) new_loop_vinfo might not actually be better than
2855 old_loop_vinfo for that VF and (b) it would be significantly
2856 worse at larger VFs.
2858 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2859 no more expensive than old_loop_vinfo even after doubling the
2860 estimated old_loop_vinfo VF. For all but trivial loops, this
2861 ensures that we only pick new_loop_vinfo if it is significantly
2862 better than old_loop_vinfo at the estimated VF. */
2864 if (est_rel_old_min != est_rel_new_min
2865 || est_rel_old_max != est_rel_new_max)
2867 HOST_WIDE_INT est_rel_new_likely
2868 = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
2869 HOST_WIDE_INT est_rel_old_likely
2870 = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
2872 return est_rel_new_likely * 2 <= est_rel_old_likely;
2875 /* If there's nothing to choose between the loop bodies, see whether
2876 there's a difference in the prologue and epilogue costs. */
2877 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2878 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2880 return false;
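/* Illustrative sketch (not part of the vectorizer): comparing cost per
   scalar iteration without dividing relies on

     new_inside_cost / new_vf < old_inside_cost / old_vf
       <=> new_inside_cost * old_vf < old_inside_cost * new_vf

   which stays exact for integer (and poly-int) costs.  The standalone
   helper below shows the comparison with plain integers; the real code
   additionally handles variable VFs and falls back to comparing
   prologue/epilogue costs on a tie.  */

static inline int
example_new_candidate_cheaper_p (int new_inside_cost, int new_vf,
                                 int old_inside_cost, int old_vf)
{
  return ((long) new_inside_cost * old_vf
          < (long) old_inside_cost * new_vf);
}

/* E.g. a body costing 20 at VF 8 (2.5 per scalar iteration) beats one
   costing 12 at VF 4 (3 per scalar iteration): 20 * 4 == 80 is less
   than 12 * 8 == 96.  */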
2883 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2884 true if we should. */
2886 static bool
2887 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2888 loop_vec_info old_loop_vinfo)
2890 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2891 return false;
2893 if (dump_enabled_p ())
2894 dump_printf_loc (MSG_NOTE, vect_location,
2895 "***** Preferring vector mode %s to vector mode %s\n",
2896 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2897 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2898 return true;
2901 /* If LOOP_VINFO is already a main loop, return it unmodified. Otherwise
2902 try to reanalyze it as a main loop. Return the loop_vinfo on success
2903 and null on failure. */
2905 static loop_vec_info
2906 vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
2908 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2909 return loop_vinfo;
2911 if (dump_enabled_p ())
2912 dump_printf_loc (MSG_NOTE, vect_location,
2913 "***** Reanalyzing as a main loop with vector mode %s\n",
2914 GET_MODE_NAME (loop_vinfo->vector_mode));
2916 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2917 vec_info_shared *shared = loop_vinfo->shared;
2918 opt_loop_vec_info main_loop_vinfo = vect_analyze_loop_form (loop, shared);
2919 gcc_assert (main_loop_vinfo);
2921 main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;
2923 bool fatal = false;
2924 bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
2925 loop->aux = NULL;
2926 if (!res)
2928 if (dump_enabled_p ())
2929 dump_printf_loc (MSG_NOTE, vect_location,
2930 "***** Failed to analyze main loop with vector"
2931 " mode %s\n",
2932 GET_MODE_NAME (loop_vinfo->vector_mode));
2933 delete main_loop_vinfo;
2934 return NULL;
2936 LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo) = 1;
2937 return main_loop_vinfo;
2940 /* Function vect_analyze_loop.
2942 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2943 for it. The different analyses will record information in the
2944 loop_vec_info struct. */
2945 opt_loop_vec_info
2946 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2948 auto_vector_modes vector_modes;
2950 /* Autodetect first vector size we try. */
2951 unsigned int autovec_flags
2952 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2953 loop->simdlen != 0);
2954 unsigned int mode_i = 0;
2956 DUMP_VECT_SCOPE ("analyze_loop_nest");
2958 if (loop_outer (loop)
2959 && loop_vec_info_for_loop (loop_outer (loop))
2960 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2961 return opt_loop_vec_info::failure_at (vect_location,
2962 "outer-loop already vectorized.\n");
2964 if (!find_loop_nest (loop, &shared->loop_nest))
2965 return opt_loop_vec_info::failure_at
2966 (vect_location,
2967 "not vectorized: loop nest containing two or more consecutive inner"
2968 " loops cannot be vectorized\n");
2970 unsigned n_stmts = 0;
2971 machine_mode autodetected_vector_mode = VOIDmode;
2972 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2973 machine_mode next_vector_mode = VOIDmode;
2974 poly_uint64 lowest_th = 0;
2975 unsigned vectorized_loops = 0;
2976 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2977 && !unlimited_cost_model (loop));
2979 bool vect_epilogues = false;
2980 opt_result res = opt_result::success ();
2981 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2982 while (1)
2984 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2985 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2986 if (!loop_vinfo)
2988 if (dump_enabled_p ())
2989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2990 "bad loop form.\n");
2991 gcc_checking_assert (first_loop_vinfo == NULL);
2992 return loop_vinfo;
2994 loop_vinfo->vector_mode = next_vector_mode;
2996 bool fatal = false;
2998 /* When pick_lowest_cost_p is true, we should in principle iterate
2999 over all the loop_vec_infos that LOOP_VINFO could replace and
3000 try to vectorize LOOP_VINFO under the same conditions.
3001 E.g. when trying to replace an epilogue loop, we should vectorize
3002 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
3003 to replace the main loop, we should vectorize LOOP_VINFO as a main
3004 loop too.
3006 However, autovectorize_vector_modes is usually sorted as follows:
3008 - Modes that naturally produce lower VFs usually follow modes that
3009 naturally produce higher VFs.
3011 - When modes naturally produce the same VF, maskable modes
3012 usually follow unmaskable ones, so that the maskable mode
3013 can be used to vectorize the epilogue of the unmaskable mode.
3015 This order is preferred because it leads to the maximum
3016 epilogue vectorization opportunities. Targets should only use
3017 a different order if they want to make wide modes available while
3018 disparaging them relative to earlier, smaller modes. The assumption
3019 in that case is that the wider modes are more expensive in some
3020 way that isn't reflected directly in the costs.
3022 There should therefore be few interesting cases in which
3023 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
3024 treated as a standalone loop, and ends up being genuinely cheaper
3025 than FIRST_LOOP_VINFO. */
3026 if (vect_epilogues)
3027 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
3029 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
3030 if (mode_i == 0)
3031 autodetected_vector_mode = loop_vinfo->vector_mode;
3032 if (dump_enabled_p ())
3034 if (res)
3035 dump_printf_loc (MSG_NOTE, vect_location,
3036 "***** Analysis succeeded with vector mode %s\n",
3037 GET_MODE_NAME (loop_vinfo->vector_mode));
3038 else
3039 dump_printf_loc (MSG_NOTE, vect_location,
3040 "***** Analysis failed with vector mode %s\n",
3041 GET_MODE_NAME (loop_vinfo->vector_mode));
3044 loop->aux = NULL;
3046 if (!fatal)
3047 while (mode_i < vector_modes.length ()
3048 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
3050 if (dump_enabled_p ())
3051 dump_printf_loc (MSG_NOTE, vect_location,
3052 "***** The result for vector mode %s would"
3053 " be the same\n",
3054 GET_MODE_NAME (vector_modes[mode_i]));
3055 mode_i += 1;
3058 if (res)
3060 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3061 vectorized_loops++;
3063 /* Once we hit the desired simdlen for the first time,
3064 discard any previous attempts. */
3065 if (simdlen
3066 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3068 delete first_loop_vinfo;
3069 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3070 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
3071 simdlen = 0;
3073 else if (pick_lowest_cost_p && first_loop_vinfo)
3075 /* Keep trying to roll back vectorization attempts while the
3076 loop_vec_infos they produced were worse than this one. */
3077 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3078 while (!vinfos.is_empty ()
3079 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3081 gcc_assert (vect_epilogues);
3082 delete vinfos.pop ();
3084 if (vinfos.is_empty ()
3085 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3087 loop_vec_info main_loop_vinfo
3088 = vect_reanalyze_as_main_loop (loop_vinfo, &n_stmts);
3089 if (main_loop_vinfo == loop_vinfo)
3091 delete first_loop_vinfo;
3092 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3094 else if (main_loop_vinfo
3095 && vect_joust_loop_vinfos (main_loop_vinfo,
3096 first_loop_vinfo))
3098 delete first_loop_vinfo;
3099 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3100 delete loop_vinfo;
3101 loop_vinfo
3102 = opt_loop_vec_info::success (main_loop_vinfo);
3104 else
3106 if (dump_enabled_p ())
3107 dump_printf_loc (MSG_NOTE, vect_location,
3108 "***** No longer preferring vector"
3109 " mode %s after reanalyzing the loop"
3110 " as a main loop\n",
3111 GET_MODE_NAME
3112 (main_loop_vinfo->vector_mode));
3113 delete main_loop_vinfo;
3118 if (first_loop_vinfo == NULL)
3120 first_loop_vinfo = loop_vinfo;
3121 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3123 else if (vect_epilogues
3124 /* For now only allow one epilogue loop. */
3125 && first_loop_vinfo->epilogue_vinfos.is_empty ())
3127 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3128 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3129 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3130 || maybe_ne (lowest_th, 0U));
3131 /* Keep track of the known smallest versioning
3132 threshold. */
3133 if (ordered_p (lowest_th, th))
3134 lowest_th = ordered_min (lowest_th, th);
3136 else
3138 delete loop_vinfo;
3139 loop_vinfo = opt_loop_vec_info::success (NULL);
3142 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3143 enabled, SIMDUID is not set, it is the innermost loop and we have
3144 either already found the loop's SIMDLEN or there was no SIMDLEN to
3145 begin with.
3146 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3147 vect_epilogues = (!simdlen
3148 && loop->inner == NULL
3149 && param_vect_epilogues_nomask
3150 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3151 && !loop->simduid
3152 /* For now only allow one epilogue loop, but allow
3153 pick_lowest_cost_p to replace it. */
3154 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
3155 || pick_lowest_cost_p));
3157 /* Commit to first_loop_vinfo if we have no reason to try
3158 alternatives. */
3159 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3160 break;
3162 else
3164 delete loop_vinfo;
3165 loop_vinfo = opt_loop_vec_info::success (NULL);
3166 if (fatal)
3168 gcc_checking_assert (first_loop_vinfo == NULL);
3169 break;
3173 /* Handle the case where the original loop can use partial
3174 vectorization but we only want to adopt it for the epilogue.
3175 The retry should be in the same mode as the original. */
3176 if (vect_epilogues
3177 && loop_vinfo
3178 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3180 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3181 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3182 if (dump_enabled_p ())
3183 dump_printf_loc (MSG_NOTE, vect_location,
3184 "***** Re-trying analysis with same vector mode"
3185 " %s for epilogue with partial vectors.\n",
3186 GET_MODE_NAME (loop_vinfo->vector_mode));
3187 continue;
3190 if (mode_i < vector_modes.length ()
3191 && VECTOR_MODE_P (autodetected_vector_mode)
3192 && (related_vector_mode (vector_modes[mode_i],
3193 GET_MODE_INNER (autodetected_vector_mode))
3194 == autodetected_vector_mode)
3195 && (related_vector_mode (autodetected_vector_mode,
3196 GET_MODE_INNER (vector_modes[mode_i]))
3197 == vector_modes[mode_i]))
3199 if (dump_enabled_p ())
3200 dump_printf_loc (MSG_NOTE, vect_location,
3201 "***** Skipping vector mode %s, which would"
3202 " repeat the analysis for %s\n",
3203 GET_MODE_NAME (vector_modes[mode_i]),
3204 GET_MODE_NAME (autodetected_vector_mode));
3205 mode_i += 1;
3208 if (mode_i == vector_modes.length ()
3209 || autodetected_vector_mode == VOIDmode)
3210 break;
3212 /* Try the next biggest vector size. */
3213 next_vector_mode = vector_modes[mode_i++];
3214 if (dump_enabled_p ())
3215 dump_printf_loc (MSG_NOTE, vect_location,
3216 "***** Re-trying analysis with vector mode %s\n",
3217 GET_MODE_NAME (next_vector_mode));
3220 if (first_loop_vinfo)
3222 loop->aux = (loop_vec_info) first_loop_vinfo;
3223 if (dump_enabled_p ())
3224 dump_printf_loc (MSG_NOTE, vect_location,
3225 "***** Choosing vector mode %s\n",
3226 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3227 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3228 return first_loop_vinfo;
3231 return opt_loop_vec_info::propagate_failure (res);
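/* Illustrative sketch (not part of the vectorizer): stripped of the
   simdlen and epilogue handling, the driver above analyzes the loop
   once per candidate vector mode, in the target's preferred order, and
   keeps either the first success or, when the target enables cost
   comparison, the cheapest success.  A standalone outline with
   hypothetical analysis results:  */

struct example_mode_analysis { int succeeded; int cost; };

static inline int
example_pick_vector_mode (const struct example_mode_analysis *modes,
                          int n_modes, int compare_costs_p)
{
  int best = -1;
  for (int i = 0; i < n_modes; i++)
    {
      if (!modes[i].succeeded)
        continue;
      if (best == -1
          || (compare_costs_p && modes[i].cost < modes[best].cost))
        best = i;
      if (!compare_costs_p)
        break;
    }
  return best;  /* index of the chosen mode, or -1 if none worked */
}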
3234 /* Return true if there is an in-order reduction function for CODE, storing
3235 it in *REDUC_FN if so. */
3237 static bool
3238 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3240 switch (code)
3242 case PLUS_EXPR:
3243 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3244 return true;
3246 default:
3247 return false;
3251 /* Function reduction_fn_for_scalar_code
3253 Input:
3254 CODE - tree_code of a reduction operation.
3256 Output:
3257 REDUC_FN - the corresponding internal function to be used to reduce the
3258 vector of partial results into a single scalar result, or IFN_LAST
3259 if the operation is a supported reduction operation, but does not have
3260 such an internal function.
3262 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3264 bool
3265 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3267 switch (code)
3269 case MAX_EXPR:
3270 *reduc_fn = IFN_REDUC_MAX;
3271 return true;
3273 case MIN_EXPR:
3274 *reduc_fn = IFN_REDUC_MIN;
3275 return true;
3277 case PLUS_EXPR:
3278 *reduc_fn = IFN_REDUC_PLUS;
3279 return true;
3281 case BIT_AND_EXPR:
3282 *reduc_fn = IFN_REDUC_AND;
3283 return true;
3285 case BIT_IOR_EXPR:
3286 *reduc_fn = IFN_REDUC_IOR;
3287 return true;
3289 case BIT_XOR_EXPR:
3290 *reduc_fn = IFN_REDUC_XOR;
3291 return true;
3293 case MULT_EXPR:
3294 case MINUS_EXPR:
3295 *reduc_fn = IFN_LAST;
3296 return true;
3298 default:
3299 return false;
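/* Illustrative sketch (not part of the vectorizer): the internal
   functions selected above reduce a vector of per-lane partial results
   to a single scalar.  For PLUS_EXPR on four lanes the effect is the
   log2-depth tree below, shown with plain scalars; for codes that get
   IFN_LAST (MULT_EXPR, MINUS_EXPR) the epilogue has to open-code a
   comparable sequence out of vector shifts and element extracts.  */

static inline int
example_reduc_plus_v4 (const int lane[4])
{
  int lo = lane[0] + lane[2];   /* combine the two halves */
  int hi = lane[1] + lane[3];
  return lo + hi;               /* final scalar result */
}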
3303 /* If there is a neutral value X such that a reduction would not be affected
3304 by the introduction of additional X elements, return that X, otherwise
3305 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3306 of the scalar elements. If the reduction has just a single initial value
3307 then INITIAL_VALUE is that value, otherwise it is null. */
3309 static tree
3310 neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
3312 switch (code)
3314 case WIDEN_SUM_EXPR:
3315 case DOT_PROD_EXPR:
3316 case SAD_EXPR:
3317 case PLUS_EXPR:
3318 case MINUS_EXPR:
3319 case BIT_IOR_EXPR:
3320 case BIT_XOR_EXPR:
3321 return build_zero_cst (scalar_type);
3323 case MULT_EXPR:
3324 return build_one_cst (scalar_type);
3326 case BIT_AND_EXPR:
3327 return build_all_ones_cst (scalar_type);
3329 case MAX_EXPR:
3330 case MIN_EXPR:
3331 return initial_value;
3333 default:
3334 return NULL_TREE;
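/* Illustrative sketch (not part of the vectorizer): the neutral value
   is what inactive or padding lanes can be filled with without changing
   the reduction result.  The checks below use plain integers to show
   this for a few of the codes handled above; the function returns 1 in
   every case.  */

static inline int
example_neutral_values_hold_p (void)
{
  int ok = 1;
  ok &= ((5 + 7) + 0) == (5 + 7);     /* PLUS_EXPR: neutral 0 */
  ok &= ((5 * 7) * 1) == (5 * 7);     /* MULT_EXPR: neutral 1 */
  ok &= ((5 & 7) & -1) == (5 & 7);    /* BIT_AND_EXPR: neutral all-ones */
  ok &= ((5 | 7) | 0) == (5 | 7);     /* BIT_IOR_EXPR: neutral 0 */
  return ok;
}

/* MIN_EXPR and MAX_EXPR have no universal neutral value, which is why
   the function above falls back to the reduction's initial value for
   them.  */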
3338 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3339 STMT is printed with a message MSG. */
3341 static void
3342 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3344 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3347 /* Return true if we need an in-order reduction for operation CODE
3348 on type TYPE, i.e. when the operation cannot safely be reassociated
3349 into per-lane partial results. */
3351 bool
3352 needs_fold_left_reduction_p (tree type, tree_code code)
3354 /* CHECKME: check for !flag_finite_math_only too? */
3355 if (SCALAR_FLOAT_TYPE_P (type))
3356 switch (code)
3358 case MIN_EXPR:
3359 case MAX_EXPR:
3360 return false;
3362 default:
3363 return !flag_associative_math;
3366 if (INTEGRAL_TYPE_P (type))
3368 if (!operation_no_trapping_overflow (type, code))
3369 return true;
3370 return false;
3373 if (SAT_FIXED_POINT_TYPE_P (type))
3374 return true;
3376 return false;
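/* Illustrative sketch (not part of the vectorizer): floating-point
   addition needs an in-order (fold-left) reduction unless
   -fassociative-math is in effect because FP addition is not
   associative, so reassociating the sum into per-lane partial sums can
   change the rounded result.  The standalone program below exhibits one
   such case with doubles.  */

#include <stdio.h>

int
main (void)
{
  double big = 1e16, small = 1.0;
  double in_order = (big + small) + small;  /* each add rounds the 1.0 away */
  double reassoc = big + (small + small);   /* the 2.0 survives rounding */
  printf ("in-order: %.1f\nreassociated: %.1f\n", in_order, reassoc);
  return in_order == reassoc;               /* 0: the two results differ */
}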
3379 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3380 has a handled computation expression. Store the main reduction
3381 operation in *CODE. */
3383 static bool
3384 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3385 tree loop_arg, enum tree_code *code,
3386 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3388 auto_bitmap visited;
3389 tree lookfor = PHI_RESULT (phi);
3390 ssa_op_iter curri;
3391 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3392 while (USE_FROM_PTR (curr) != loop_arg)
3393 curr = op_iter_next_use (&curri);
3394 curri.i = curri.numops;
3397 path.safe_push (std::make_pair (curri, curr));
3398 tree use = USE_FROM_PTR (curr);
3399 if (use == lookfor)
3400 break;
3401 gimple *def = SSA_NAME_DEF_STMT (use);
3402 if (gimple_nop_p (def)
3403 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3405 pop:
3408 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3409 curri = x.first;
3410 curr = x.second;
3412 curr = op_iter_next_use (&curri);
3413 /* Skip already visited or non-SSA operands (from iterating
3414 over PHI args). */
3415 while (curr != NULL_USE_OPERAND_P
3416 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3417 || ! bitmap_set_bit (visited,
3418 SSA_NAME_VERSION
3419 (USE_FROM_PTR (curr)))));
3421 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3422 if (curr == NULL_USE_OPERAND_P)
3423 break;
3425 else
3427 if (gimple_code (def) == GIMPLE_PHI)
3428 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3429 else
3430 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3431 while (curr != NULL_USE_OPERAND_P
3432 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3433 || ! bitmap_set_bit (visited,
3434 SSA_NAME_VERSION
3435 (USE_FROM_PTR (curr)))))
3436 curr = op_iter_next_use (&curri);
3437 if (curr == NULL_USE_OPERAND_P)
3438 goto pop;
3441 while (1);
3442 if (dump_file && (dump_flags & TDF_DETAILS))
3444 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3445 unsigned i;
3446 std::pair<ssa_op_iter, use_operand_p> *x;
3447 FOR_EACH_VEC_ELT (path, i, x)
3448 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3449 dump_printf (MSG_NOTE, "\n");
3452 /* Check whether the reduction path detected is valid. */
3453 bool fail = path.length () == 0;
3454 bool neg = false;
3455 int sign = -1;
3456 *code = ERROR_MARK;
3457 for (unsigned i = 1; i < path.length (); ++i)
3459 gimple *use_stmt = USE_STMT (path[i].second);
3460 tree op = USE_FROM_PTR (path[i].second);
3461 if (! is_gimple_assign (use_stmt)
3462 /* The following makes sure we can compute the operand index
3463 easily, and it mostly disallows chaining via COND_EXPR condition
3464 operands. */
3465 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3466 && (gimple_num_ops (use_stmt) <= 2
3467 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3468 && (gimple_num_ops (use_stmt) <= 3
3469 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3471 fail = true;
3472 break;
3474 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3475 if (use_code == MINUS_EXPR)
3477 use_code = PLUS_EXPR;
3478 /* Track whether we negate the reduction value each iteration. */
3479 if (gimple_assign_rhs2 (use_stmt) == op)
3480 neg = ! neg;
3482 if (CONVERT_EXPR_CODE_P (use_code)
3483 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3484 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3486 else if (*code == ERROR_MARK)
3488 *code = use_code;
3489 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3491 else if (use_code != *code)
3493 fail = true;
3494 break;
3496 else if ((use_code == MIN_EXPR
3497 || use_code == MAX_EXPR)
3498 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3500 fail = true;
3501 break;
3503 /* Check that the op is used in only a single stmt. For the
3504 non-value-changing tail and the last stmt allow out-of-loop uses.
3505 ??? We could relax this and handle arbitrary live stmts by
3506 forcing a scalar epilogue for example. */
3507 imm_use_iterator imm_iter;
3508 gimple *op_use_stmt;
3509 unsigned cnt = 0;
3510 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3511 if (!is_gimple_debug (op_use_stmt)
3512 && (*code != ERROR_MARK
3513 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3515 /* We want to allow x + x but not x < 1 ? x : 2. */
3516 if (is_gimple_assign (op_use_stmt)
3517 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3519 use_operand_p use_p;
3520 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3521 cnt++;
3523 else
3524 cnt++;
3526 if (cnt != 1)
3528 fail = true;
3529 break;
3532 return ! fail && ! neg && *code != ERROR_MARK;
3535 bool
3536 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3537 tree loop_arg, enum tree_code code)
3539 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3540 enum tree_code code_;
3541 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3542 && code_ == code);
3547 /* Function vect_is_simple_reduction
3549 (1) Detect a cross-iteration def-use cycle that represents a simple
3550 reduction computation. We look for the following pattern:
3552 loop_header:
3553 a1 = phi < a0, a2 >
3554 a3 = ...
3555 a2 = operation (a3, a1)
3559 a3 = ...
3560 loop_header:
3561 a1 = phi < a0, a2 >
3562 a2 = operation (a3, a1)
3564 such that:
3565 1. operation is commutative and associative and it is safe to
3566 change the order of the computation
3567 2. no uses for a2 in the loop (a2 is used out of the loop)
3568 3. no uses of a1 in the loop besides the reduction operation
3569 4. no uses of a1 outside the loop.
3571 Conditions 1,4 are tested here.
3572 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3574 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3575 nested cycles.
3577 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3578 reductions:
3580 a1 = phi < a0, a2 >
3581 inner loop (def of a3)
3582 a2 = phi < a3 >
3584 (4) Detect condition expressions, i.e.:
3585 for (int i = 0; i < N; i++)
3586 if (a[i] < val)
3587 ret_val = a[i];
*/
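/* As a concrete instance of pattern (1) above, the scalar loop

     s = s0;
     for (i = 0; i < n; i++)
       s += a[i];

   has a1 as the loop-header PHI for s, a3 as the load of a[i] and
   a2 as the PLUS_EXPR combining the two.  */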
3591 static stmt_vec_info
3592 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3593 bool *double_reduc, bool *reduc_chain_p)
3595 gphi *phi = as_a <gphi *> (phi_info->stmt);
3596 gimple *phi_use_stmt = NULL;
3597 imm_use_iterator imm_iter;
3598 use_operand_p use_p;
3600 *double_reduc = false;
3601 *reduc_chain_p = false;
3602 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3604 tree phi_name = PHI_RESULT (phi);
3605 /* ??? If there are no uses of the PHI result the inner loop reduction
3606 won't be detected as possibly double-reduction by vectorizable_reduction
3607 because that tries to walk the PHI arg from the preheader edge which
3608 can be constant. See PR60382. */
3609 if (has_zero_uses (phi_name))
3610 return NULL;
3611 class loop *loop = (gimple_bb (phi))->loop_father;
3612 unsigned nphi_def_loop_uses = 0;
3613 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3615 gimple *use_stmt = USE_STMT (use_p);
3616 if (is_gimple_debug (use_stmt))
3617 continue;
3619 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3621 if (dump_enabled_p ())
3622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3623 "intermediate value used outside loop.\n");
3625 return NULL;
3628 nphi_def_loop_uses++;
3629 phi_use_stmt = use_stmt;
3632 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3633 if (TREE_CODE (latch_def) != SSA_NAME)
3635 if (dump_enabled_p ())
3636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3637 "reduction: not ssa_name: %T\n", latch_def);
3638 return NULL;
3641 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3642 if (!def_stmt_info
3643 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3644 return NULL;
3646 bool nested_in_vect_loop
3647 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3648 unsigned nlatch_def_loop_uses = 0;
3649 auto_vec<gphi *, 3> lcphis;
3650 bool inner_loop_of_double_reduc = false;
3651 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3653 gimple *use_stmt = USE_STMT (use_p);
3654 if (is_gimple_debug (use_stmt))
3655 continue;
3656 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3657 nlatch_def_loop_uses++;
3658 else
3660 /* We can have more than one loop-closed PHI. */
3661 lcphis.safe_push (as_a <gphi *> (use_stmt));
3662 if (nested_in_vect_loop
3663 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3664 == vect_double_reduction_def))
3665 inner_loop_of_double_reduc = true;
3669 /* If we are vectorizing an inner reduction, we execute it in the
3670 original order only when it is not part of a double reduction. */
3672 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3674 if (dump_enabled_p ())
3675 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3676 "detected nested cycle: ");
3677 return def_stmt_info;
3680 /* If this isn't a nested cycle or if the nested cycle reduction value
3681 is used outside of the inner loop we cannot handle uses of the reduction
3682 value. */
3683 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3685 if (dump_enabled_p ())
3686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3687 "reduction used in loop.\n");
3688 return NULL;
3691 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3692 defined in the inner loop. */
3693 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3695 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3696 if (gimple_phi_num_args (def_stmt) != 1
3697 || TREE_CODE (op1) != SSA_NAME)
3699 if (dump_enabled_p ())
3700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3701 "unsupported phi node definition.\n");
3703 return NULL;
3706 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3707 if (gimple_bb (def1)
3708 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3709 && loop->inner
3710 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3711 && is_gimple_assign (def1)
3712 && is_a <gphi *> (phi_use_stmt)
3713 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3715 if (dump_enabled_p ())
3716 report_vect_op (MSG_NOTE, def_stmt,
3717 "detected double reduction: ");
3719 *double_reduc = true;
3720 return def_stmt_info;
3723 return NULL;
3726 /* Look for the expression computing latch_def from the loop PHI result. */
3727 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3728 enum tree_code code;
3729 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3730 path))
3732 STMT_VINFO_REDUC_CODE (phi_info) = code;
3733 if (code == COND_EXPR && !nested_in_vect_loop)
3734 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3736 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3737 reduction chain for which the additional restriction is that
3738 all operations in the chain are the same. */
3739 auto_vec<stmt_vec_info, 8> reduc_chain;
3740 unsigned i;
3741 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3742 for (i = path.length () - 1; i >= 1; --i)
3744 gimple *stmt = USE_STMT (path[i].second);
3745 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3746 STMT_VINFO_REDUC_IDX (stmt_info)
3747 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3748 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3749 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3750 && (i == 1 || i == path.length () - 1));
3751 if ((stmt_code != code && !leading_conversion)
3752 /* We can only handle the final value in epilogue
3753 generation for reduction chains. */
3754 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3755 is_slp_reduc = false;
3756 /* For reduction chains we support trailing/leading
3757 conversions. We do not store those in the actual chain. */
3758 if (leading_conversion)
3759 continue;
3760 reduc_chain.safe_push (stmt_info);
3762 if (is_slp_reduc && reduc_chain.length () > 1)
3764 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3766 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3767 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3769 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3770 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3772 /* Save the chain for further analysis in SLP detection. */
3773 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3774 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3776 *reduc_chain_p = true;
3777 if (dump_enabled_p ())
3778 dump_printf_loc (MSG_NOTE, vect_location,
3779 "reduction: detected reduction chain\n");
3781 else if (dump_enabled_p ())
3782 dump_printf_loc (MSG_NOTE, vect_location,
3783 "reduction: detected reduction\n");
3785 return def_stmt_info;
3788 if (dump_enabled_p ())
3789 dump_printf_loc (MSG_NOTE, vect_location,
3790 "reduction: unknown pattern\n");
3792 return NULL;
3795 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3796 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3797 or -1 if not known. */
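/* For example (illustrative numbers): with 100 known iterations, an
   assumed VF of 8 and 3 prologue iterations the epilogue peels
   (100 - 3) % 8 = 1 iteration, while an unknown iteration count falls
   back to the VF/2 = 4 estimate.  */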
3799 static int
3800 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3802 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3803 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3805 if (dump_enabled_p ())
3806 dump_printf_loc (MSG_NOTE, vect_location,
3807 "cost model: epilogue peel iters set to vf/2 "
3808 "because loop iterations are unknown.\n");
3809 return assumed_vf / 2;
3811 else
3813 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3814 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3815 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3816 /* If we need to peel for gaps but the computed epilogue peel count
3817 is zero, we have to peel VF iterations instead. */
3818 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3819 peel_iters_epilogue = assumed_vf;
3820 return peel_iters_epilogue;
3824 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
int
3826 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3827 int *peel_iters_epilogue,
3828 stmt_vector_for_cost *scalar_cost_vec,
3829 stmt_vector_for_cost *prologue_cost_vec,
3830 stmt_vector_for_cost *epilogue_cost_vec)
3832 int retval = 0;
3834 *peel_iters_epilogue
3835 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3837 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3839 /* If peeled iterations are known but the number of scalar loop
3840 iterations is unknown, count a taken branch per peeled loop. */
3841 if (peel_iters_prologue > 0)
3842 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3843 NULL, NULL_TREE, 0, vect_prologue);
3844 if (*peel_iters_epilogue > 0)
3845 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3846 NULL, NULL_TREE, 0, vect_epilogue);
3849 stmt_info_for_cost *si;
3850 int j;
3851 if (peel_iters_prologue)
3852 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3853 retval += record_stmt_cost (prologue_cost_vec,
3854 si->count * peel_iters_prologue,
3855 si->kind, si->stmt_info, si->misalign,
3856 vect_prologue);
3857 if (*peel_iters_epilogue)
3858 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3859 retval += record_stmt_cost (epilogue_cost_vec,
3860 si->count * *peel_iters_epilogue,
3861 si->kind, si->stmt_info, si->misalign,
3862 vect_epilogue);
3864 return retval;
3867 /* Function vect_estimate_min_profitable_iters
3869 Return the number of iterations required for the vector version of the
3870 loop to be profitable relative to the cost of the scalar version of the
3871 loop.
3873 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3874 of iterations for vectorization. A value of -1 means loop vectorization
3875 is not profitable. This returned value may be used for a dynamic
3876 profitability check.
3878 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3879 for static check against estimated number of iterations. */
3881 static void
3882 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3883 int *ret_min_profitable_niters,
3884 int *ret_min_profitable_estimate)
3886 int min_profitable_iters;
3887 int min_profitable_estimate;
3888 int peel_iters_prologue;
3889 int peel_iters_epilogue;
3890 unsigned vec_inside_cost = 0;
3891 int vec_outside_cost = 0;
3892 unsigned vec_prologue_cost = 0;
3893 unsigned vec_epilogue_cost = 0;
3894 int scalar_single_iter_cost = 0;
3895 int scalar_outside_cost = 0;
3896 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3897 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3898 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3900 /* Cost model disabled. */
3901 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3903 if (dump_enabled_p ())
3904 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3905 *ret_min_profitable_niters = 0;
3906 *ret_min_profitable_estimate = 0;
3907 return;
3910 /* Requires loop versioning tests to handle misalignment. */
3911 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3913 /* FIXME: Make cost depend on complexity of individual check. */
3914 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3915 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3916 NULL, NULL_TREE, 0, vect_prologue);
3917 if (dump_enabled_p ())
3918 dump_printf (MSG_NOTE,
3919 "cost model: Adding cost of checks for loop "
3920 "versioning to treat misalignment.\n");
3923 /* Requires loop versioning with alias checks. */
3924 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3926 /* FIXME: Make cost depend on complexity of individual check. */
3927 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3928 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3929 NULL, NULL_TREE, 0, vect_prologue);
3930 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3931 if (len)
3932 /* Count LEN - 1 ANDs and LEN comparisons. */
3933 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3934 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3935 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3936 if (len)
3938 /* Count LEN - 1 ANDs and LEN comparisons. */
3939 unsigned int nstmts = len * 2 - 1;
3940 /* +1 for each bias that needs adding. */
3941 for (unsigned int i = 0; i < len; ++i)
3942 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3943 nstmts += 1;
3944 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3945 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3947 if (dump_enabled_p ())
3948 dump_printf (MSG_NOTE,
3949 "cost model: Adding cost of checks for loop "
3950 "versioning aliasing.\n");
3953 /* Requires loop versioning with niter checks. */
3954 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3956 /* FIXME: Make cost depend on complexity of individual check. */
3957 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3958 NULL, NULL_TREE, 0, vect_prologue);
3959 if (dump_enabled_p ())
3960 dump_printf (MSG_NOTE,
3961 "cost model: Adding cost of checks for loop "
3962 "versioning niters.\n");
3965 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3966 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3967 NULL, NULL_TREE, 0, vect_prologue);
3969 /* Count statements in scalar loop. Using this as scalar cost for a single
3970 iteration for now.
3972 TODO: Add outer loop support.
3974 TODO: Consider assigning different costs to different scalar
3975 statements. */
3977 scalar_single_iter_cost
3978 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3980 /* Add additional cost for the peeled instructions in prologue and epilogue
3981 loop. (For fully-masked loops there will be no peeling.)
3983 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3984 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3986 TODO: Build an expression that represents peel_iters for prologue and
3987 epilogue to be used in a run-time test. */
3989 bool prologue_need_br_taken_cost = false;
3990 bool prologue_need_br_not_taken_cost = false;
3992 /* Calculate peel_iters_prologue. */
3993 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3994 peel_iters_prologue = 0;
3995 else if (npeel < 0)
3997 peel_iters_prologue = assumed_vf / 2;
3998 if (dump_enabled_p ())
3999 dump_printf (MSG_NOTE, "cost model: "
4000 "prologue peel iters set to vf/2.\n");
4002 /* If peeled iterations are unknown, count a taken branch and a not taken
4003 branch per peeled loop. Even if scalar loop iterations are known,
4004 vector iterations are not known since peeled prologue iterations are
4005 not known. Hence guards remain the same. */
4006 prologue_need_br_taken_cost = true;
4007 prologue_need_br_not_taken_cost = true;
4009 else
4011 peel_iters_prologue = npeel;
4012 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4013 /* If peeled iterations are known but the number of scalar loop
4014 iterations is unknown, count a taken branch per peeled loop. */
4015 prologue_need_br_taken_cost = true;
4018 bool epilogue_need_br_taken_cost = false;
4019 bool epilogue_need_br_not_taken_cost = false;
4021 /* Calculate peel_iters_epilogue. */
4022 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4023 /* We need to peel exactly one iteration for gaps. */
4024 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4025 else if (npeel < 0)
4027 /* If peeling for alignment is unknown, the loop bound of the main
4028 loop becomes unknown. */
4029 peel_iters_epilogue = assumed_vf / 2;
4030 if (dump_enabled_p ())
4031 dump_printf (MSG_NOTE, "cost model: "
4032 "epilogue peel iters set to vf/2 because "
4033 "peeling for alignment is unknown.\n");
4035 /* See the same reason above in peel_iters_prologue calculation. */
4036 epilogue_need_br_taken_cost = true;
4037 epilogue_need_br_not_taken_cost = true;
4039 else
4041 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4042 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4043 /* If peeled iterations are known but the number of scalar loop
4044 iterations is unknown, count a taken branch per peeled loop. */
4045 epilogue_need_br_taken_cost = true;
4048 stmt_info_for_cost *si;
4049 int j;
4050 /* Add costs associated with peel_iters_prologue. */
4051 if (peel_iters_prologue)
4052 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4054 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4055 si->count * peel_iters_prologue, si->kind,
4056 si->stmt_info, si->vectype, si->misalign,
4057 vect_prologue);
4060 /* Add costs associated with peel_iters_epilogue. */
4061 if (peel_iters_epilogue)
4062 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4064 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4065 si->count * peel_iters_epilogue, si->kind,
4066 si->stmt_info, si->vectype, si->misalign,
4067 vect_epilogue);
4070 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4072 if (prologue_need_br_taken_cost)
4073 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4074 NULL, NULL_TREE, 0, vect_prologue);
4076 if (prologue_need_br_not_taken_cost)
4077 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4078 cond_branch_not_taken, NULL, NULL_TREE, 0,
4079 vect_prologue);
4081 if (epilogue_need_br_taken_cost)
4082 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4083 NULL, NULL_TREE, 0, vect_epilogue);
4085 if (epilogue_need_br_not_taken_cost)
4086 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4087 cond_branch_not_taken, NULL, NULL_TREE, 0,
4088 vect_epilogue);
4090 /* Take care of special costs for rgroup controls of partial vectors. */
4091 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4093 /* Calculate how many masks we need to generate. */
4094 unsigned int num_masks = 0;
4095 rgroup_controls *rgm;
4096 unsigned int num_vectors_m1;
4097 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4098 if (rgm->type)
4099 num_masks += num_vectors_m1 + 1;
4100 gcc_assert (num_masks > 0);
4102 /* In the worst case, we need to generate each mask in the prologue
4103 and in the loop body. One of the loop body mask instructions
4104 replaces the comparison in the scalar loop, and since we don't
4105 count the scalar comparison against the scalar body, we shouldn't
4106 count that vector instruction against the vector body either.
4108 Sometimes we can use unpacks instead of generating prologue
4109 masks and sometimes the prologue mask will fold to a constant,
4110 so the actual prologue cost might be smaller. However, it's
4111 simpler and safer to use the worst-case cost; if this ends up
4112 being the tie-breaker between vectorizing or not, then it's
4113 probably better not to vectorize. */
4114 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
4115 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4116 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
4117 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4119 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4121 /* Referring to the functions vect_set_loop_condition_partial_vectors
4122 and vect_set_loop_controls_directly, we need to generate each
4123 length in the prologue and in the loop body if required. Although
4124 there are some possible optimizations, we consider the worst case
4125 here. */
4127 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4128 bool need_iterate_p
4129 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4130 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4132 /* Calculate how many statements to be added. */
4133 unsigned int prologue_stmts = 0;
4134 unsigned int body_stmts = 0;
4136 rgroup_controls *rgc;
4137 unsigned int num_vectors_m1;
4138 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4139 if (rgc->type)
4141 /* May need one SHIFT for nitems_total computation. */
4142 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4143 if (nitems != 1 && !niters_known_p)
4144 prologue_stmts += 1;
4146 /* May need one MAX and one MINUS for wrap around. */
4147 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4148 prologue_stmts += 2;
4150 /* Need one MAX and one MINUS for each batch limit except for
4151 the first one. */
4152 prologue_stmts += num_vectors_m1 * 2;
4154 unsigned int num_vectors = num_vectors_m1 + 1;
4156 /* Need to set up lengths in prologue, only one MIN required
4157 for each since start index is zero. */
4158 prologue_stmts += num_vectors;
4160 /* Each may need two MINs and one MINUS to update lengths in body
4161 for next iteration. */
4162 if (need_iterate_p)
4163 body_stmts += 3 * num_vectors;
4166 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4167 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4168 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4169 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4172 /* FORNOW: The scalar outside cost is incremented in one of the
4173 following ways:
4175 1. The vectorizer checks for alignment and aliasing and generates
4176 a condition that allows dynamic vectorization. A cost model
4177 check is ANDED with the versioning condition. Hence scalar code
4178 path now has the added cost of the versioning check.
4180 if (cost > th & versioning_check)
4181 jmp to vector code
4183 Hence run-time scalar is incremented by not-taken branch cost.
4185 2. The vectorizer then checks if a prologue is required. If the
4186 cost model check was not done before during versioning, it has to
4187 be done before the prologue check.
4189 if (cost <= th)
4190 prologue = scalar_iters
4191 if (prologue == 0)
4192 jmp to vector code
4193 else
4194 execute prologue
4195 if (prologue == num_iters)
4196 go to exit
4198 Hence the run-time scalar cost is incremented by a taken branch,
4199 plus a not-taken branch, plus a taken branch cost.
4201 3. The vectorizer then checks if an epilogue is required. If the
4202 cost model check was not done before during prologue check, it
4203 has to be done with the epilogue check.
4205 if (prologue == 0)
4206 jmp to vector code
4207 else
4208 execute prologue
4209 if (prologue == num_iters)
4210 go to exit
4211 vector code:
4212 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4213 jmp to epilogue
4215 Hence the run-time scalar cost should be incremented by 2 taken
4216 branches.
4218 TODO: The back end may order the BBs differently and reverse
4219 conditions/branch directions. Change the estimates below to
4220 something more reasonable. */
4222 /* If the number of iterations is known and we do not do versioning, we can
4223 decide whether to vectorize at compile time. Hence the scalar version
4224 does not carry cost model guard costs. */
4225 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4226 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4228 /* Cost model check occurs at versioning. */
4229 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4230 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4231 else
4233 /* Cost model check occurs at prologue generation. */
4234 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4235 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4236 + vect_get_stmt_cost (cond_branch_not_taken);
4237 /* Cost model check occurs at epilogue generation. */
4238 else
4239 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4243 /* Complete the target-specific cost calculations. */
4244 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4245 &vec_inside_cost, &vec_epilogue_cost);
4247 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4249 /* Stash the costs so that we can compare two loop_vec_infos. */
4250 loop_vinfo->vec_inside_cost = vec_inside_cost;
4251 loop_vinfo->vec_outside_cost = vec_outside_cost;
4253 if (dump_enabled_p ())
4255 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4256 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4257 vec_inside_cost);
4258 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4259 vec_prologue_cost);
4260 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4261 vec_epilogue_cost);
4262 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4263 scalar_single_iter_cost);
4264 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4265 scalar_outside_cost);
4266 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4267 vec_outside_cost);
4268 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4269 peel_iters_prologue);
4270 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4271 peel_iters_epilogue);
4274 /* Calculate number of iterations required to make the vector version
4275 profitable, relative to the loop bodies only. The following condition
4276 must hold true:
4277 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4278 where
4279 SIC = scalar iteration cost, VIC = vector iteration cost,
4280 VOC = vector outside cost, VF = vectorization factor,
4281 NPEEL = prologue iterations + epilogue iterations,
4282 SOC = scalar outside cost for run time cost model check. */
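/* As a worked example with purely illustrative costs SIC = 4, VIC = 8,
   VOC = 20, SOC = 6, VF = 4 and NPEEL = 0: each vector iteration saves
   SIC * VF - VIC = 8, and the computation below arrives at a runtime
   threshold of 8 scalar iterations, where the scalar cost
   4 * 8 + 6 = 38 exceeds the vector cost 8 * (8 / 4) + 20 = 36.  */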
4284 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4285 - vec_inside_cost);
4286 if (saving_per_viter <= 0)
4288 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4289 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4290 "vectorization did not happen for a simd loop");
4292 if (dump_enabled_p ())
4293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4294 "cost model: the vector iteration cost = %d "
4295 "divided by the scalar iteration cost = %d "
4296 "is greater or equal to the vectorization factor = %d"
4297 ".\n",
4298 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4299 *ret_min_profitable_niters = -1;
4300 *ret_min_profitable_estimate = -1;
4301 return;
4304 /* ??? The "if" arm is written to handle all cases; see below for what
4305 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4306 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4308 /* Rewriting the condition above in terms of the number of
4309 vector iterations (vniters) rather than the number of
4310 scalar iterations (niters) gives:
4312 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4314 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4316 For integer N, X and Y when X > 0:
4318 N * X > Y <==> N >= (Y /[floor] X) + 1. */
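/* E.g. for Y = 10 and X = 4, N * 4 > 10 holds exactly for
   N >= 10 /[floor] 4 + 1 = 3.  */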
4319 int outside_overhead = (vec_outside_cost
4320 - scalar_single_iter_cost * peel_iters_prologue
4321 - scalar_single_iter_cost * peel_iters_epilogue
4322 - scalar_outside_cost);
4323 /* We're only interested in cases that require at least one
4324 vector iteration. */
4325 int min_vec_niters = 1;
4326 if (outside_overhead > 0)
4327 min_vec_niters = outside_overhead / saving_per_viter + 1;
4329 if (dump_enabled_p ())
4330 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4331 min_vec_niters);
4333 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4335 /* Now that we know the minimum number of vector iterations,
4336 find the minimum niters for which the scalar cost is larger:
4338 SIC * niters > VIC * vniters + VOC - SOC
4340 We know that the minimum niters is no more than
4341 vniters * VF + NPEEL, but it might be (and often is) less
4342 than that if a partial vector iteration is cheaper than the
4343 equivalent scalar code. */
4344 int threshold = (vec_inside_cost * min_vec_niters
4345 + vec_outside_cost
4346 - scalar_outside_cost);
4347 if (threshold <= 0)
4348 min_profitable_iters = 1;
4349 else
4350 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4352 else
4353 /* Convert the number of vector iterations into a number of
4354 scalar iterations. */
4355 min_profitable_iters = (min_vec_niters * assumed_vf
4356 + peel_iters_prologue
4357 + peel_iters_epilogue);
4359 else
4361 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4362 * assumed_vf
4363 - vec_inside_cost * peel_iters_prologue
4364 - vec_inside_cost * peel_iters_epilogue);
4365 if (min_profitable_iters <= 0)
4366 min_profitable_iters = 0;
4367 else
4369 min_profitable_iters /= saving_per_viter;
4371 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4372 <= (((int) vec_inside_cost * min_profitable_iters)
4373 + (((int) vec_outside_cost - scalar_outside_cost)
4374 * assumed_vf)))
4375 min_profitable_iters++;
4379 if (dump_enabled_p ())
4380 dump_printf (MSG_NOTE,
4381 " Calculated minimum iters for profitability: %d\n",
4382 min_profitable_iters);
4384 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4385 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4386 /* We want the vectorized loop to execute at least once. */
4387 min_profitable_iters = assumed_vf + peel_iters_prologue;
4388 else if (min_profitable_iters < peel_iters_prologue)
4389 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4390 vectorized loop executes at least once. */
4391 min_profitable_iters = peel_iters_prologue;
4393 if (dump_enabled_p ())
4394 dump_printf_loc (MSG_NOTE, vect_location,
4395 " Runtime profitability threshold = %d\n",
4396 min_profitable_iters);
4398 *ret_min_profitable_niters = min_profitable_iters;
4400 /* Calculate number of iterations required to make the vector version
4401 profitable, relative to the loop bodies only.
4403 The non-vectorized variant costs SIC * niters and must win over the vector
4404 variant on the expected loop trip count. The following condition must hold true:
4405 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4407 if (vec_outside_cost <= 0)
4408 min_profitable_estimate = 0;
4409 /* ??? This "else if" arm is written to handle all cases; see below for
4410 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4411 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4413 /* This is a repeat of the code above, but with + SOC rather
4414 than - SOC. */
4415 int outside_overhead = (vec_outside_cost
4416 - scalar_single_iter_cost * peel_iters_prologue
4417 - scalar_single_iter_cost * peel_iters_epilogue
4418 + scalar_outside_cost);
4419 int min_vec_niters = 1;
4420 if (outside_overhead > 0)
4421 min_vec_niters = outside_overhead / saving_per_viter + 1;
4423 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4425 int threshold = (vec_inside_cost * min_vec_niters
4426 + vec_outside_cost
4427 + scalar_outside_cost);
4428 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4430 else
4431 min_profitable_estimate = (min_vec_niters * assumed_vf
4432 + peel_iters_prologue
4433 + peel_iters_epilogue);
4435 else
4437 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4438 * assumed_vf
4439 - vec_inside_cost * peel_iters_prologue
4440 - vec_inside_cost * peel_iters_epilogue)
4441 / ((scalar_single_iter_cost * assumed_vf)
4442 - vec_inside_cost);
4444 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4445 if (dump_enabled_p ())
4446 dump_printf_loc (MSG_NOTE, vect_location,
4447 " Static estimate profitability threshold = %d\n",
4448 min_profitable_estimate);
4450 *ret_min_profitable_estimate = min_profitable_estimate;
4453 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4454 vector elements (not bits) for a vector with NELT elements. */
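/* For instance, OFFSET = 2 pushes the stepped pattern {2, 3, 4}, which
   vec_perm_indices extends to {2, 3, ..., NELT + 1}; indices of NELT or
   more select from the second permute operand, giving the effect of
   shifting the first operand down by two elements.  */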
4455 static void
4456 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4457 vec_perm_builder *sel)
4459 /* The encoding is a single stepped pattern. Any wrap-around is handled
4460 by vec_perm_indices. */
4461 sel->new_vector (nelt, 1, 3);
4462 for (unsigned int i = 0; i < 3; i++)
4463 sel->quick_push (i + offset);
4466 /* Checks whether the target supports whole-vector shifts for vectors of mode
4467 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4468 it supports vec_perm_const with masks for all necessary shift amounts. */
4469 static bool
4470 have_whole_vector_shift (machine_mode mode)
4472 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4473 return true;
4475 /* Variable-length vectors should be handled via the optab. */
4476 unsigned int nelt;
4477 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4478 return false;
4480 vec_perm_builder sel;
4481 vec_perm_indices indices;
4482 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4484 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4485 indices.new_vector (sel, 2, nelt);
4486 if (!can_vec_perm_const_p (mode, indices, false))
4487 return false;
4489 return true;
4492 /* TODO: There is a close dependency between the vect_model_*_cost and
4493 vectorizable_* functions. Improve the design to avoid maintenance issues. */
4495 /* Function vect_model_reduction_cost.
4497 Models cost for a reduction operation, including the vector ops
4498 generated within the strip-mine loop in some cases, the initial
4499 definition before the loop, and the epilogue code that must be generated. */
4501 static void
4502 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4503 stmt_vec_info stmt_info, internal_fn reduc_fn,
4504 vect_reduction_type reduction_type,
4505 int ncopies, stmt_vector_for_cost *cost_vec)
4507 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4508 enum tree_code code;
4509 optab optab;
4510 tree vectype;
4511 machine_mode mode;
4512 class loop *loop = NULL;
4514 if (loop_vinfo)
4515 loop = LOOP_VINFO_LOOP (loop_vinfo);
4517 /* Condition reductions generate two reductions in the loop. */
4518 if (reduction_type == COND_REDUCTION)
4519 ncopies *= 2;
4521 vectype = STMT_VINFO_VECTYPE (stmt_info);
4522 mode = TYPE_MODE (vectype);
4523 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4525 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4527 if (reduction_type == EXTRACT_LAST_REDUCTION)
4528 /* No extra instructions are needed in the prologue. The loop body
4529 operations are costed in vectorizable_condition. */
4530 inside_cost = 0;
4531 else if (reduction_type == FOLD_LEFT_REDUCTION)
4533 /* No extra instructions needed in the prologue. */
4534 prologue_cost = 0;
4536 if (reduc_fn != IFN_LAST)
4537 /* Count one reduction-like operation per vector. */
4538 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4539 stmt_info, 0, vect_body);
4540 else
4542 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4543 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4544 inside_cost = record_stmt_cost (cost_vec, nelements,
4545 vec_to_scalar, stmt_info, 0,
4546 vect_body);
4547 inside_cost += record_stmt_cost (cost_vec, nelements,
4548 scalar_stmt, stmt_info, 0,
4549 vect_body);
4552 else
4554 /* Add in cost for initial definition.
4555 For cond reduction we have four vectors: initial index, step,
4556 initial result of the data reduction, initial value of the index
4557 reduction. */
4558 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4559 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4560 scalar_to_vec, stmt_info, 0,
4561 vect_prologue);
4564 /* Determine cost of epilogue code.
4566 We have a reduction operator that will reduce the vector in one statement.
4567 Also requires scalar extract. */
4569 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4571 if (reduc_fn != IFN_LAST)
4573 if (reduction_type == COND_REDUCTION)
4575 /* An EQ stmt and a COND_EXPR stmt. */
4576 epilogue_cost += record_stmt_cost (cost_vec, 2,
4577 vector_stmt, stmt_info, 0,
4578 vect_epilogue);
4579 /* Reduction of the max index and a reduction of the found
4580 values. */
4581 epilogue_cost += record_stmt_cost (cost_vec, 2,
4582 vec_to_scalar, stmt_info, 0,
4583 vect_epilogue);
4584 /* A broadcast of the max value. */
4585 epilogue_cost += record_stmt_cost (cost_vec, 1,
4586 scalar_to_vec, stmt_info, 0,
4587 vect_epilogue);
4589 else
4591 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4592 stmt_info, 0, vect_epilogue);
4593 epilogue_cost += record_stmt_cost (cost_vec, 1,
4594 vec_to_scalar, stmt_info, 0,
4595 vect_epilogue);
4598 else if (reduction_type == COND_REDUCTION)
4600 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4601 /* Extraction of scalar elements. */
4602 epilogue_cost += record_stmt_cost (cost_vec,
4603 2 * estimated_nunits,
4604 vec_to_scalar, stmt_info, 0,
4605 vect_epilogue);
4606 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4607 epilogue_cost += record_stmt_cost (cost_vec,
4608 2 * estimated_nunits - 3,
4609 scalar_stmt, stmt_info, 0,
4610 vect_epilogue);
4612 else if (reduction_type == EXTRACT_LAST_REDUCTION
4613 || reduction_type == FOLD_LEFT_REDUCTION)
4614 /* No extra instructions are needed in the epilogue. */
4616 else
4618 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4619 tree bitsize =
4620 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4621 int element_bitsize = tree_to_uhwi (bitsize);
4622 int nelements = vec_size_in_bits / element_bitsize;
4624 if (code == COND_EXPR)
4625 code = MAX_EXPR;
4627 optab = optab_for_tree_code (code, vectype, optab_default);
4629 /* We have a whole vector shift available. */
4630 if (optab != unknown_optab
4631 && VECTOR_MODE_P (mode)
4632 && optab_handler (optab, mode) != CODE_FOR_nothing
4633 && have_whole_vector_shift (mode))
4635 /* Final reduction via vector shifts and the reduction operator.
4636 Also requires scalar extract. */
4637 epilogue_cost += record_stmt_cost (cost_vec,
4638 exact_log2 (nelements) * 2,
4639 vector_stmt, stmt_info, 0,
4640 vect_epilogue);
4641 epilogue_cost += record_stmt_cost (cost_vec, 1,
4642 vec_to_scalar, stmt_info, 0,
4643 vect_epilogue);
4645 else
4646 /* Use extracts and reduction op for final reduction. For N
4647 elements, we have N extracts and N-1 reduction ops. */
4648 epilogue_cost += record_stmt_cost (cost_vec,
4649 nelements + nelements - 1,
4650 vector_stmt, stmt_info, 0,
4651 vect_epilogue);
4655 if (dump_enabled_p ())
4656 dump_printf (MSG_NOTE,
4657 "vect_model_reduction_cost: inside_cost = %d, "
4658 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4659 prologue_cost, epilogue_cost);
4662 /* SEQ is a sequence of instructions that initialize the reduction
4663 described by REDUC_INFO. Emit them in the appropriate place. */
4665 static void
4666 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4667 stmt_vec_info reduc_info, gimple *seq)
4669 if (reduc_info->reused_accumulator)
4671 /* When reusing an accumulator from the main loop, we only need
4672 initialization instructions if the main loop can be skipped.
4673 In that case, emit the initialization instructions at the end
4674 of the guard block that does the skip. */
4675 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4676 gcc_assert (skip_edge);
4677 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4678 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4680 else
4682 /* The normal case: emit the initialization instructions on the
4683 preheader edge. */
4684 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4685 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4689 /* Function get_initial_def_for_reduction
4691 Input:
4692 REDUC_INFO - the info_for_reduction
4693 INIT_VAL - the initial value of the reduction variable
4694 NEUTRAL_OP - a value that has no effect on the reduction, as per
4695 neutral_op_for_reduction
4697 Output:
4698 Return a vector variable, initialized according to the operation that
4699 STMT_VINFO performs. This vector will be used as the initial value
4700 of the vector of partial results.
4702 The value we need is a vector in which element 0 has value INIT_VAL
4703 and every other element has value NEUTRAL_OP. */
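/* For instance, a summation with INIT_VAL 5, NEUTRAL_OP 0 and a
   four-element vector type yields {5, 0, 0, 0}; if INIT_VAL equals
   NEUTRAL_OP the result degenerates to a simple splat.  */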
4705 static tree
4706 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4707 stmt_vec_info reduc_info,
4708 tree init_val, tree neutral_op)
4710 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4711 tree scalar_type = TREE_TYPE (init_val);
4712 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4713 tree init_def;
4714 gimple_seq stmts = NULL;
4716 gcc_assert (vectype);
4718 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4719 || SCALAR_FLOAT_TYPE_P (scalar_type));
4721 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4722 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4724 if (operand_equal_p (init_val, neutral_op))
4726 /* If both elements are equal then the vector described above is
4727 just a splat. */
4728 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4729 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4731 else
4733 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4734 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4735 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4737 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4738 element 0. */
4739 init_def = gimple_build_vector_from_val (&stmts, vectype,
4740 neutral_op);
4741 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4742 vectype, init_def, init_val);
4744 else
4746 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4747 tree_vector_builder elts (vectype, 1, 2);
4748 elts.quick_push (init_val);
4749 elts.quick_push (neutral_op);
4750 init_def = gimple_build_vector (&stmts, &elts);
4754 if (stmts)
4755 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4756 return init_def;
4759 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4760 which performs a reduction involving GROUP_SIZE scalar statements.
4761 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4762 is nonnull, introducing extra elements of that value will not change the
4763 result. */
4765 static void
4766 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4767 stmt_vec_info reduc_info,
4768 vec<tree> *vec_oprnds,
4769 unsigned int number_of_vectors,
4770 unsigned int group_size, tree neutral_op)
4772 vec<tree> &initial_values = reduc_info->reduc_initial_values;
4773 unsigned HOST_WIDE_INT nunits;
4774 unsigned j, number_of_places_left_in_vector;
4775 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4776 unsigned int i;
4778 gcc_assert (group_size == initial_values.length () || neutral_op);
4780 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4781 created vectors. It is greater than 1 if unrolling is performed.
4783 For example, we have two scalar operands, s1 and s2 (e.g., group of
4784 strided accesses of size two), while NUNITS is four (i.e., four scalars
4785 of this type can be packed in a vector). The output vector will contain
4786 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4787 will be 2).
4789 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4790 vectors containing the operands.
4792 For example, NUNITS is four as before, and the group size is 8
4793 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4794 {s5, s6, s7, s8}. */
4796 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4797 nunits = group_size;
4799 number_of_places_left_in_vector = nunits;
4800 bool constant_p = true;
4801 tree_vector_builder elts (vector_type, nunits, 1);
4802 elts.quick_grow (nunits);
4803 gimple_seq ctor_seq = NULL;
4804 for (j = 0; j < nunits * number_of_vectors; ++j)
4806 tree op;
4807 i = j % group_size;
4809 /* Get the def before the loop. In a reduction chain we have only
4810 one initial value; otherwise we have as many as there are PHIs in the group. */
4811 if (i >= initial_values.length () || (j > i && neutral_op))
4812 op = neutral_op;
4813 else
4814 op = initial_values[i];
4816 /* Create 'vect_ = {op0,op1,...,opn}'. */
4817 number_of_places_left_in_vector--;
4818 elts[nunits - number_of_places_left_in_vector - 1] = op;
4819 if (!CONSTANT_CLASS_P (op))
4820 constant_p = false;
4822 if (number_of_places_left_in_vector == 0)
4824 tree init;
4825 if (constant_p && !neutral_op
4826 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4827 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4828 /* Build the vector directly from ELTS. */
4829 init = gimple_build_vector (&ctor_seq, &elts);
4830 else if (neutral_op)
4832 /* Build a vector of the neutral value and shift the
4833 other elements into place. */
4834 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4835 neutral_op);
4836 int k = nunits;
4837 while (k > 0 && elts[k - 1] == neutral_op)
4838 k -= 1;
4839 while (k > 0)
4841 k -= 1;
4842 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4843 vector_type, init, elts[k]);
4846 else
4848 /* First time round, duplicate ELTS to fill the
4849 required number of vectors. */
4850 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4851 elts, number_of_vectors, *vec_oprnds);
4852 break;
4854 vec_oprnds->quick_push (init);
4856 number_of_places_left_in_vector = nunits;
4857 elts.new_vector (vector_type, nunits, 1);
4858 elts.quick_grow (nunits);
4859 constant_p = true;
4862 if (ctor_seq != NULL)
4863 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4866 /* For a statement STMT_INFO taking part in a reduction operation return
4867 the stmt_vec_info the meta information is stored on. */
4869 stmt_vec_info
4870 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4872 stmt_info = vect_orig_stmt (stmt_info);
4873 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4874 if (!is_a <gphi *> (stmt_info->stmt)
4875 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4876 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4877 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4878 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4880 if (gimple_phi_num_args (phi) == 1)
4881 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4883 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4885 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4886 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4887 stmt_info = info;
4889 return stmt_info;
4892 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4893 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
4894 return false. */
4896 static bool
4897 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4898 stmt_vec_info reduc_info)
4900 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4901 if (!main_loop_vinfo)
4902 return false;
4904 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4905 return false;
4907 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4908 auto_vec<tree, 16> main_loop_results (num_phis);
4909 auto_vec<tree, 16> initial_values (num_phis);
4910 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4912 /* The epilogue loop can be entered either from the main loop or
4913 from an earlier guard block. */
4914 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4915 for (tree incoming_value : reduc_info->reduc_initial_values)
4917 /* Look for:
4919 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4920 INITIAL_VALUE(guard block)>. */
4921 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4923 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4924 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4926 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4927 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4929 main_loop_results.quick_push (from_main_loop);
4930 initial_values.quick_push (from_skip);
4933 else
4934 /* The main loop dominates the epilogue loop. */
4935 main_loop_results.splice (reduc_info->reduc_initial_values);
4937 /* See if the main loop has the kind of accumulator we need. */
4938 vect_reusable_accumulator *accumulator
4939 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4940 if (!accumulator
4941 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4942 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4943 accumulator->reduc_info->reduc_scalar_results.begin ()))
4944 return false;
4946 /* Handle the case where we can reduce wider vectors to narrower ones. */
4947 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4948 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4949 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4950 TYPE_VECTOR_SUBPARTS (vectype)))
4951 return false;
4953 /* Non-SLP reductions might apply an adjustment after the reduction
4954 operation, in order to simplify the initialization of the accumulator.
4955 If the epilogue loop carries on from where the main loop left off,
4956 it should apply the same adjustment to the final reduction result.
4958 If the epilogue loop can also be entered directly (rather than via
4959 the main loop), we need to be able to handle that case in the same way,
4960 with the same adjustment. (In principle we could add a PHI node
4961 to select the correct adjustment, but in practice that shouldn't be
4962 necessary.) */
4963 tree main_adjustment
4964 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
4965 if (loop_vinfo->main_loop_edge && main_adjustment)
4967 gcc_assert (num_phis == 1);
4968 tree initial_value = initial_values[0];
4969 /* Check that we can use INITIAL_VALUE as the adjustment and
4970 initialize the accumulator with a neutral value instead. */
4971 if (!operand_equal_p (initial_value, main_adjustment))
4972 return false;
4973 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4974 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
4975 code, initial_value);
4977 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
4978 reduc_info->reduc_initial_values.truncate (0);
4979 reduc_info->reduc_initial_values.splice (initial_values);
4980 reduc_info->reused_accumulator = accumulator;
4981 return true;
4984 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
4985 CODE, appending the stmts to SEQ. Returns a vector def of VECTYPE. */
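/* For example, reducing a V8SI VEC_DEF to a V4SI VECTYPE with PLUS_EXPR
   extracts the low and high V4SI halves and adds them; the halving
   repeats until the requested number of elements is reached.  */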
4987 static tree
4988 vect_create_partial_epilog (tree vec_def, tree vectype, enum tree_code code,
4989 gimple_seq *seq)
4991 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
4992 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4993 tree stype = TREE_TYPE (vectype);
4994 tree new_temp = vec_def;
4995 while (nunits > nunits1)
4997 nunits /= 2;
4998 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
4999 stype, nunits);
5000 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5002 /* The target has to make sure we support lowpart/highpart
5003 extraction, either via direct vector extract or through
5004 an integer mode punning. */
5005 tree dst1, dst2;
5006 gimple *epilog_stmt;
5007 if (convert_optab_handler (vec_extract_optab,
5008 TYPE_MODE (TREE_TYPE (new_temp)),
5009 TYPE_MODE (vectype1))
5010 != CODE_FOR_nothing)
5012 /* Extract sub-vectors directly once vec_extract becomes
5013 a conversion optab. */
5014 dst1 = make_ssa_name (vectype1);
5015 epilog_stmt
5016 = gimple_build_assign (dst1, BIT_FIELD_REF,
5017 build3 (BIT_FIELD_REF, vectype1,
5018 new_temp, TYPE_SIZE (vectype1),
5019 bitsize_int (0)));
5020 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5021 dst2 = make_ssa_name (vectype1);
5022 epilog_stmt
5023 = gimple_build_assign (dst2, BIT_FIELD_REF,
5024 build3 (BIT_FIELD_REF, vectype1,
5025 new_temp, TYPE_SIZE (vectype1),
5026 bitsize_int (bitsize)));
5027 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5029 else
5031 /* Extract via punning to appropriately sized integer mode
5032 vector. */
5033 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5034 tree etype = build_vector_type (eltype, 2);
5035 gcc_assert (convert_optab_handler (vec_extract_optab,
5036 TYPE_MODE (etype),
5037 TYPE_MODE (eltype))
5038 != CODE_FOR_nothing);
5039 tree tem = make_ssa_name (etype);
5040 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5041 build1 (VIEW_CONVERT_EXPR,
5042 etype, new_temp));
5043 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5044 new_temp = tem;
5045 tem = make_ssa_name (eltype);
5046 epilog_stmt
5047 = gimple_build_assign (tem, BIT_FIELD_REF,
5048 build3 (BIT_FIELD_REF, eltype,
5049 new_temp, TYPE_SIZE (eltype),
5050 bitsize_int (0)));
5051 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5052 dst1 = make_ssa_name (vectype1);
5053 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5054 build1 (VIEW_CONVERT_EXPR,
5055 vectype1, tem));
5056 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5057 tem = make_ssa_name (eltype);
5058 epilog_stmt
5059 = gimple_build_assign (tem, BIT_FIELD_REF,
5060 build3 (BIT_FIELD_REF, eltype,
5061 new_temp, TYPE_SIZE (eltype),
5062 bitsize_int (bitsize)));
5063 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5064 dst2 = make_ssa_name (vectype1);
5065 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5066 build1 (VIEW_CONVERT_EXPR,
5067 vectype1, tem));
5068 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5071 new_temp = make_ssa_name (vectype1);
5072 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5073 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5076 return new_temp;
5079 /* Function vect_create_epilog_for_reduction
5081 Create code at the loop-epilog to finalize the result of a reduction
5082 computation.
5084 STMT_INFO is the scalar reduction stmt that is being vectorized.
5085 SLP_NODE is an SLP node containing a group of reduction statements. The
5086 first one in this group is STMT_INFO.
5087 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
5088 REDUC_INDEX says which rhs operand of STMT_INFO is the reduction phi
5089 (counting from 0).
5091 This function:
5092 1. Completes the reduction def-use cycles.
5093 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5094 by calling the function specified by REDUC_FN if available, or by
5095 other means (whole-vector shifts or a scalar loop).
5096 The function also creates a new phi node at the loop exit to preserve
5097 loop-closed form, as illustrated below.
5099 The flow at the entry to this function:
5101 loop:
5102 vec_def = phi <vec_init, null> # REDUCTION_PHI
5103 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5104 s_loop = scalar_stmt # (scalar) STMT_INFO
5105 loop_exit:
5106 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5107 use <s_out0>
5108 use <s_out0>
5110 The above is transformed by this function into:
5112 loop:
5113 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5114 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5115 s_loop = scalar_stmt # (scalar) STMT_INFO
5116 loop_exit:
5117 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5118 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5119 v_out2 = reduce <v_out1>
5120 s_out3 = extract_field <v_out2, 0>
5121 s_out4 = adjust_result <s_out3>
5122 use <s_out4>
5123 use <s_out4>
5126 static void
5127 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5128 stmt_vec_info stmt_info,
5129 slp_tree slp_node,
5130 slp_instance slp_node_instance)
5132 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5133 gcc_assert (reduc_info->is_reduc_info);
5134 /* For double reductions we need to get at the inner loop reduction
5135 stmt, which has the meta info attached. Our stmt_info is that of the
5136 loop-closed PHI of the inner loop, which we remember as the
5137 def for the reduction PHI generation. */
5138 bool double_reduc = false;
5139 stmt_vec_info rdef_info = stmt_info;
5140 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5142 gcc_assert (!slp_node);
5143 double_reduc = true;
5144 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5145 (stmt_info->stmt, 0));
5146 stmt_info = vect_stmt_to_vectorize (stmt_info);
5148 gphi *reduc_def_stmt
5149 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5150 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
5151 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5152 tree vectype;
5153 machine_mode mode;
5154 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5155 basic_block exit_bb;
5156 tree scalar_dest;
5157 tree scalar_type;
5158 gimple *new_phi = NULL, *phi;
5159 gimple_stmt_iterator exit_gsi;
5160 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5161 gimple *epilog_stmt = NULL;
5162 gimple *exit_phi;
5163 tree bitsize;
5164 tree def;
5165 tree orig_name, scalar_result;
5166 imm_use_iterator imm_iter, phi_imm_iter;
5167 use_operand_p use_p, phi_use_p;
5168 gimple *use_stmt;
5169 auto_vec<tree> reduc_inputs;
5170 int j, i;
5171 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5172 unsigned int group_size = 1, k;
5173 auto_vec<gimple *> phis;
5174 /* SLP reduction without reduction chain, e.g.,
5175 # a1 = phi <a2, a0>
5176 # b1 = phi <b2, b0>
5177 a2 = operation (a1)
5178 b2 = operation (b1) */
5179 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5180 bool direct_slp_reduc;
5181 tree induction_index = NULL_TREE;
5183 if (slp_node)
5184 group_size = SLP_TREE_LANES (slp_node);
5186 if (nested_in_vect_loop_p (loop, stmt_info))
5188 outer_loop = loop;
5189 loop = loop->inner;
5190 gcc_assert (!slp_node && double_reduc);
5193 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5194 gcc_assert (vectype);
5195 mode = TYPE_MODE (vectype);
5197 tree induc_val = NULL_TREE;
5198 tree adjustment_def = NULL;
5199 if (slp_node)
5201 else
5203 /* Optimize: for induction condition reduction, if we can't use zero
5204 for induc_val, use initial_def. */
5205 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5206 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5207 else if (double_reduc)
5209 else
5210 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5213 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5214 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5215 if (slp_reduc)
5216 /* All statements produce live-out values. */
5217 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5218 else if (slp_node)
5219 /* The last statement in the reduction chain produces the live-out
5220 value. */
5221 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5223 unsigned vec_num;
5224 int ncopies;
5225 if (slp_node)
5227 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5228 ncopies = 1;
5230 else
5232 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5233 vec_num = 1;
5234 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5237 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5238 which is updated with the current index of the loop for every match of
5239 the original loop's cond_expr (VEC_STMT). This results in a vector
5240 that records, for each vector lane, the last time the condition passed.
5241 The first match will be a 1 to allow 0 to be used for non-matching
5242 indexes. If there are no matches at all then the vector will be all
5243 zeroes.
5245 PR92772: This algorithm is broken for architectures that support
5246 masked vectors, but do not provide fold_extract_last. */
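   /* A worked example with assumed values, for a 4-lane vector: the IV
      created below starts at {1, 2, 3, 4} and steps by 4 each iteration.
      If the condition matches in lanes 1 and 3 on the first iteration and
      only in lane 1 on the second, the phi evolves as

        start              {0, 0, 0, 0}
        after iteration 0  {0, 2, 0, 4}
        after iteration 1  {0, 6, 0, 4}

      so taking the maximum (6) in the epilogue identifies the lane and
      iteration of the last match.  */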
5247 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5249 auto_vec<std::pair<tree, bool>, 2> ccompares;
5250 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5251 cond_info = vect_stmt_to_vectorize (cond_info);
5252 while (cond_info != reduc_info)
5254 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5256 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5257 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5258 ccompares.safe_push
5259 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5260 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5262 cond_info
5263 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5264 1 + STMT_VINFO_REDUC_IDX
5265 (cond_info)));
5266 cond_info = vect_stmt_to_vectorize (cond_info);
5268 gcc_assert (ccompares.length () != 0);
5270 tree indx_before_incr, indx_after_incr;
5271 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5272 int scalar_precision
5273 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5274 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5275 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5276 (TYPE_MODE (vectype), cr_index_scalar_type,
5277 TYPE_VECTOR_SUBPARTS (vectype));
5279 /* First we create a simple vector induction variable which starts
5280 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5281 vector size (STEP). */
5283 /* Create a {1,2,3,...} vector. */
5284 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5286 /* Create a vector of the step value. */
5287 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5288 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5290 /* Create an induction variable. */
5291 gimple_stmt_iterator incr_gsi;
5292 bool insert_after;
5293 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5294 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5295 insert_after, &indx_before_incr, &indx_after_incr);
5297 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5298 filled with zeros (VEC_ZERO). */
5300 /* Create a vector of 0s. */
5301 tree zero = build_zero_cst (cr_index_scalar_type);
5302 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5304 /* Create a vector phi node. */
5305 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5306 new_phi = create_phi_node (new_phi_tree, loop->header);
5307 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5308 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5310 /* Now take the condition from the loop's original cond_exprs
5311 and produce a new cond_expr (INDEX_COND_EXPR) which for
5312 every match uses values from the induction variable
5313 (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
5314 (NEW_PHI_TREE).
5315 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5316 the new cond_expr (INDEX_COND_EXPR). */
5317 gimple_seq stmts = NULL;
5318 for (int i = ccompares.length () - 1; i != -1; --i)
5320 tree ccompare = ccompares[i].first;
5321 if (ccompares[i].second)
5322 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5323 cr_index_vector_type,
5324 ccompare,
5325 indx_before_incr, new_phi_tree);
5326 else
5327 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5328 cr_index_vector_type,
5329 ccompare,
5330 new_phi_tree, indx_before_incr);
5332 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5334 /* Update the phi with the vec cond. */
5335 induction_index = new_phi_tree;
5336 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5337 loop_latch_edge (loop), UNKNOWN_LOCATION);
5340 /* 2. Create epilog code.
5341 The reduction epilog code operates across the elements of the vector
5342 of partial results computed by the vectorized loop.
5343 The reduction epilog code consists of:
5345 step 1: compute the scalar result in a vector (v_out2)
5346 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5347 step 3: adjust the scalar result (s_out3) if needed.
5349 Step 1 can be accomplished using one of the following three schemes
5350 (scheme 1) using reduc_fn, if available.
5351 (scheme 2) using whole-vector shifts, if available.
5352 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5353 combined.
5355 The overall epilog code looks like this:
5357 s_out0 = phi <s_loop> # original EXIT_PHI
5358 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5359 v_out2 = reduce <v_out1> # step 1
5360 s_out3 = extract_field <v_out2, 0> # step 2
5361 s_out4 = adjust_result <s_out3> # step 3
5363 (step 3 is optional, and steps 1 and 2 may be combined).
5364 Lastly, the uses of s_out0 are replaced by s_out4. */
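   /* Purely as an illustration (assuming a 4-lane PLUS reduction):
      scheme 1 emits a single internal call such as
        v_out2 = .REDUC_PLUS (v_out1)
      scheme 2 emits log2(4) shift/op pairs
        t = vec_shift <v_out1, 2>;  v = v_out1 + t;
        t = vec_shift <v, 1>;       v = v + t;
      and scheme 3 extracts the four elements and adds them with scalar
      code, combining steps 1 and 2.  */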
5367 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5368 v_out1 = phi <VECT_DEF>
5369 Store them in NEW_PHIS. */
5370 if (double_reduc)
5371 loop = outer_loop;
5372 exit_bb = single_exit (loop)->dest;
5373 exit_gsi = gsi_after_labels (exit_bb);
5374 reduc_inputs.create (slp_node ? vec_num : ncopies);
5375 for (unsigned i = 0; i < vec_num; i++)
5377 gimple_seq stmts = NULL;
5378 if (slp_node)
5379 def = vect_get_slp_vect_def (slp_node, i);
5380 else
5381 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5382 for (j = 0; j < ncopies; j++)
5384 tree new_def = copy_ssa_name (def);
5385 phi = create_phi_node (new_def, exit_bb);
5386 if (j)
5387 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5388 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5389 new_def = gimple_convert (&stmts, vectype, new_def);
5390 reduc_inputs.quick_push (new_def);
5392 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5395 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5396 (i.e. when reduc_fn is not available) and in the final adjustment
5397 code (if needed). Also get the original scalar reduction variable as
5398 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
5399 represents a reduction pattern), the tree-code and scalar-def are
5400 taken from the original stmt that the pattern-stmt (STMT) replaces.
5401 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5402 are taken from STMT. */
5404 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5405 if (orig_stmt_info != stmt_info)
5407 /* Reduction pattern */
5408 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5409 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5412 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5413 scalar_type = TREE_TYPE (scalar_dest);
5414 scalar_results.create (group_size);
5415 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5416 bitsize = TYPE_SIZE (scalar_type);
5418 /* True if we should implement SLP_REDUC using native reduction operations
5419 instead of scalar operations. */
5420 direct_slp_reduc = (reduc_fn != IFN_LAST
5421 && slp_reduc
5422 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5424 /* In case of reduction chain, e.g.,
5425 # a1 = phi <a3, a0>
5426 a2 = operation (a1)
5427 a3 = operation (a2),
5429 we may end up with more than one vector result. Here we reduce them
5430 to one vector.
5432 The same is true if we couldn't use a single def-use cycle. */
5433 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5434 || direct_slp_reduc
5435 || ncopies > 1)
5437 gimple_seq stmts = NULL;
5438 tree single_input = reduc_inputs[0];
5439 for (k = 1; k < reduc_inputs.length (); k++)
5440 single_input = gimple_build (&stmts, code, vectype,
5441 single_input, reduc_inputs[k]);
5442 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5444 reduc_inputs.truncate (0);
5445 reduc_inputs.safe_push (single_input);
5448 tree orig_reduc_input = reduc_inputs[0];
5450 /* If this loop is an epilogue loop that can be skipped after the
5451 main loop, we can only share a reduction operation between the
5452 main loop and the epilogue if we put it at the target of the
5453 skip edge.
5455 We can still reuse accumulators if this check fails. Doing so has
5456 the minor(?) benefit of making the epilogue loop's scalar result
5457 independent of the main loop's scalar result. */
5458 bool unify_with_main_loop_p = false;
5459 if (reduc_info->reused_accumulator
5460 && loop_vinfo->skip_this_loop_edge
5461 && single_succ_p (exit_bb)
5462 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5464 unify_with_main_loop_p = true;
5466 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5467 reduc_inputs[0] = make_ssa_name (vectype);
5468 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5469 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5470 UNKNOWN_LOCATION);
5471 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5472 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5473 exit_gsi = gsi_after_labels (reduc_block);
5476 /* Shouldn't be used beyond this point. */
5477 exit_bb = nullptr;
5479 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5480 && reduc_fn != IFN_LAST)
5482 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5483 various data values where the condition matched and another vector
5484 (INDUCTION_INDEX) containing all the indexes of those matches. We
5485 need to extract the last matching index (which will be the index with
5486 the highest value) and use this to index into the data vector.
5487 For the case where there were no matches, the data vector will contain
5488 all default values and the index vector will be all zeros. */
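   /* A small worked example with assumed contents: if
        reduc_inputs[0] = {d0, d1, d2, d3}
        induction_index = {0, 6, 0, 4}
      the code below computes max_index = 6, the EQ compare selects
      {0, d1, 0, 0}, and the final MAX reduction over the unsigned view
      of that vector yields d1, the data value stored by the last
      match.  */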
5490 /* Get various versions of the type of the vector of indexes. */
5491 tree index_vec_type = TREE_TYPE (induction_index);
5492 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5493 tree index_scalar_type = TREE_TYPE (index_vec_type);
5494 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5496 /* Get an unsigned integer version of the type of the data vector. */
5497 int scalar_precision
5498 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5499 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5500 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5501 vectype);
5503 /* First we need to create a vector (ZERO_VEC) of zeros and another
5504 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5505 can create using a MAX reduction and then expanding.
5506 In the case where the loop never made any matches, the max index will
5507 be zero. */
5509 /* Vector of {0, 0, 0,...}. */
5510 tree zero_vec = build_zero_cst (vectype);
5512 /* Find maximum value from the vector of found indexes. */
5513 tree max_index = make_ssa_name (index_scalar_type);
5514 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5515 1, induction_index);
5516 gimple_call_set_lhs (max_index_stmt, max_index);
5517 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5519 /* Vector of {max_index, max_index, max_index,...}. */
5520 tree max_index_vec = make_ssa_name (index_vec_type);
5521 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5522 max_index);
5523 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5524 max_index_vec_rhs);
5525 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5527 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5528 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5529 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5530 otherwise. Only one value should match, resulting in a vector
5531 (VEC_COND) with one data value and the rest zeros.
5532 In the case where the loop never made any matches, every index will
5533 match, resulting in a vector with all data values (which will all be
5534 the default value). */
5536 /* Compare the max index vector to the vector of found indexes to find
5537 the position of the max value. */
5538 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5539 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5540 induction_index,
5541 max_index_vec);
5542 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5544 /* Use the compare to choose either values from the data vector or
5545 zero. */
5546 tree vec_cond = make_ssa_name (vectype);
5547 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5548 vec_compare,
5549 reduc_inputs[0],
5550 zero_vec);
5551 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5553 /* Finally we need to extract the data value from the vector (VEC_COND)
5554 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5555 reduction, but because this doesn't exist, we can use a MAX reduction
5556 instead. The data value might be signed or a float so we need to cast
5557 it first.
5558 In the case where the loop never made any matches, the data values are
5559 all identical, and so will reduce down correctly. */
5561 /* Make the matched data values unsigned. */
5562 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5563 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5564 vec_cond);
5565 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5566 VIEW_CONVERT_EXPR,
5567 vec_cond_cast_rhs);
5568 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5570 /* Reduce down to a scalar value. */
5571 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5572 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5573 1, vec_cond_cast);
5574 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5575 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5577 /* Convert the reduced value back to the result type and set as the
5578 result. */
5579 gimple_seq stmts = NULL;
5580 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5581 data_reduc);
5582 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5583 scalar_results.safe_push (new_temp);
5585 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5586 && reduc_fn == IFN_LAST)
5588 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5589 idx = 0;
5590 idx_val = induction_index[0];
5591 val = data_reduc[0];
5592 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5593 if (induction_index[i] > idx_val)
5594 val = data_reduc[i], idx_val = induction_index[i];
5595 return val; */
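   /* Unrolled for an assumed two-element vector this amounts to
        val = induction_index[1] > induction_index[0]
              ? data_reduc[1] : data_reduc[0];
      with the MAX of the indexes skipped for the last element, where it
      is no longer needed.  */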
5597 tree data_eltype = TREE_TYPE (vectype);
5598 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5599 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5600 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5601 /* Enforced by vectorizable_reduction, which ensures we have target
5602 support before allowing a conditional reduction on variable-length
5603 vectors. */
5604 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5605 tree idx_val = NULL_TREE, val = NULL_TREE;
5606 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5608 tree old_idx_val = idx_val;
5609 tree old_val = val;
5610 idx_val = make_ssa_name (idx_eltype);
5611 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5612 build3 (BIT_FIELD_REF, idx_eltype,
5613 induction_index,
5614 bitsize_int (el_size),
5615 bitsize_int (off)));
5616 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5617 val = make_ssa_name (data_eltype);
5618 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5619 build3 (BIT_FIELD_REF,
5620 data_eltype,
5621 reduc_inputs[0],
5622 bitsize_int (el_size),
5623 bitsize_int (off)));
5624 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5625 if (off != 0)
5627 tree new_idx_val = idx_val;
5628 if (off != v_size - el_size)
5630 new_idx_val = make_ssa_name (idx_eltype);
5631 epilog_stmt = gimple_build_assign (new_idx_val,
5632 MAX_EXPR, idx_val,
5633 old_idx_val);
5634 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5636 tree new_val = make_ssa_name (data_eltype);
5637 epilog_stmt = gimple_build_assign (new_val,
5638 COND_EXPR,
5639 build2 (GT_EXPR,
5640 boolean_type_node,
5641 idx_val,
5642 old_idx_val),
5643 val, old_val);
5644 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5645 idx_val = new_idx_val;
5646 val = new_val;
5649 /* Convert the reduced value back to the result type and set as the
5650 result. */
5651 gimple_seq stmts = NULL;
5652 val = gimple_convert (&stmts, scalar_type, val);
5653 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5654 scalar_results.safe_push (val);
5657 /* 2.3 Create the reduction code, using one of the three schemes described
5658 above. In SLP we simply need to extract all the elements from the
5659 vector (without reducing them), so we use scalar shifts. */
5660 else if (reduc_fn != IFN_LAST && !slp_reduc)
5662 tree tmp;
5663 tree vec_elem_type;
5665 /* Case 1: Create:
5666 v_out2 = reduc_expr <v_out1> */
5668 if (dump_enabled_p ())
5669 dump_printf_loc (MSG_NOTE, vect_location,
5670 "Reduce using direct vector reduction.\n");
5672 gimple_seq stmts = NULL;
5673 vec_elem_type = TREE_TYPE (vectype);
5674 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5675 vec_elem_type, reduc_inputs[0]);
5676 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5677 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5679 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5680 && induc_val)
5682 /* Earlier we set the initial value to be a vector of induc_val
5683 values. Check the result and if it is induc_val then replace
5684 it with the original initial value, unless induc_val is
5685 already the same as initial_def.
5686 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5687 induc_val);
5688 tree initial_def = reduc_info->reduc_initial_values[0];
5690 tmp = make_ssa_name (new_scalar_dest);
5691 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5692 initial_def, new_temp);
5693 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5694 new_temp = tmp;
5697 scalar_results.safe_push (new_temp);
5699 else if (direct_slp_reduc)
5701 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5702 with the elements for other SLP statements replaced with the
5703 neutral value. We can then do a normal reduction on each vector. */
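   /* For example (layout assumed): with REDUC_GROUP_SIZE == 2 and
      reduc_inputs[0] = {a0, b0, a1, b1} the masked index vector built
      below is {0, 1, 0, 1}; for i == 0 the select keeps the "a" lanes and
      replaces the "b" lanes with the neutral value, so the full-vector
      reduction of the result gives the scalar value for the first SLP
      statement, and similarly for i == 1.  */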
5705 /* Enforced by vectorizable_reduction. */
5706 gcc_assert (reduc_inputs.length () == 1);
5707 gcc_assert (pow2p_hwi (group_size));
5709 gimple_seq seq = NULL;
5711 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5712 and the same element size as VECTYPE. */
5713 tree index = build_index_vector (vectype, 0, 1);
5714 tree index_type = TREE_TYPE (index);
5715 tree index_elt_type = TREE_TYPE (index_type);
5716 tree mask_type = truth_type_for (index_type);
5718 /* Create a vector that, for each element, identifies which of
5719 the REDUC_GROUP_SIZE results should use it. */
5720 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5721 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5722 build_vector_from_val (index_type, index_mask));
5724 /* Get a neutral vector value. This is simply a splat of the neutral
5725 scalar value if we have one, otherwise the initial scalar value
5726 is itself a neutral value. */
5727 tree vector_identity = NULL_TREE;
5728 tree neutral_op = NULL_TREE;
5729 if (slp_node)
5731 tree initial_value = NULL_TREE;
5732 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5733 initial_value = reduc_info->reduc_initial_values[0];
5734 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5735 initial_value);
5737 if (neutral_op)
5738 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5739 neutral_op);
5740 for (unsigned int i = 0; i < group_size; ++i)
5742 /* If there's no universal neutral value, we can use the
5743 initial scalar value from the original PHI. This is used
5744 for MIN and MAX reduction, for example. */
5745 if (!neutral_op)
5747 tree scalar_value = reduc_info->reduc_initial_values[i];
5748 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5749 scalar_value);
5750 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5751 scalar_value);
5754 /* Calculate the equivalent of:
5756 sel[j] = (index[j] == i);
5758 which selects the elements of REDUC_INPUTS[0] that should
5759 be included in the result. */
5760 tree compare_val = build_int_cst (index_elt_type, i);
5761 compare_val = build_vector_from_val (index_type, compare_val);
5762 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5763 index, compare_val);
5765 /* Calculate the equivalent of:
5767 vec = sel ? reduc_inputs[0] : vector_identity;
5769 VEC is now suitable for a full vector reduction. */
5770 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5771 sel, reduc_inputs[0], vector_identity);
5773 /* Do the reduction and convert it to the appropriate type. */
5774 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5775 TREE_TYPE (vectype), vec);
5776 scalar = gimple_convert (&seq, scalar_type, scalar);
5777 scalar_results.safe_push (scalar);
5779 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5781 else
5783 bool reduce_with_shift;
5784 tree vec_temp;
5786 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5788 /* See if the target wants to do the final (shift) reduction
5789 in a vector mode of smaller size and first reduce upper/lower
5790 halves against each other. */
5791 enum machine_mode mode1 = mode;
5792 tree stype = TREE_TYPE (vectype);
5793 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5794 unsigned nunits1 = nunits;
5795 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5796 && reduc_inputs.length () == 1)
5798 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5799 /* For SLP reductions we have to make sure lanes match up, but
5800 since we're doing individual element final reduction, reducing
5801 the vector width here is even more important.
5802 ??? We can also separate lanes with permutes; for the common
5803 case of a power-of-two group size, odd/even extracts would work.
5804 if (slp_reduc && nunits != nunits1)
5806 nunits1 = least_common_multiple (nunits1, group_size);
5807 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5810 if (!slp_reduc
5811 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5812 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5814 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5815 stype, nunits1);
5816 reduce_with_shift = have_whole_vector_shift (mode1);
5817 if (!VECTOR_MODE_P (mode1))
5818 reduce_with_shift = false;
5819 else
5821 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5822 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5823 reduce_with_shift = false;
5826 /* First reduce the vector to the desired vector size that we should
5827 do the shift reduction on, by combining upper and lower halves.
5828 gimple_seq stmts = NULL;
5829 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5830 code, &stmts);
5831 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5832 reduc_inputs[0] = new_temp;
5834 if (reduce_with_shift && !slp_reduc)
5836 int element_bitsize = tree_to_uhwi (bitsize);
5837 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5838 for variable-length vectors and also requires direct target support
5839 for loop reductions. */
5840 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5841 int nelements = vec_size_in_bits / element_bitsize;
5842 vec_perm_builder sel;
5843 vec_perm_indices indices;
5845 int elt_offset;
5847 tree zero_vec = build_zero_cst (vectype1);
5848 /* Case 2: Create:
5849 for (offset = nelements/2; offset >= 1; offset/=2)
5851 Create: va' = vec_shift <va, offset>
5852 Create: va = vop <va, va'>
5853 } */
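   /* Concretely (illustrative only), for an assumed 8-element vector
      and PLUS this expands to three shift/add pairs
        va' = vec_shift <va, 4>;  va = va + va';
        va' = vec_shift <va, 2>;  va = va + va';
        va' = vec_shift <va, 1>;  va = va + va';
      after which element 0 of VA holds the scalar sum.  */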
5855 tree rhs;
5857 if (dump_enabled_p ())
5858 dump_printf_loc (MSG_NOTE, vect_location,
5859 "Reduce using vector shifts\n");
5861 gimple_seq stmts = NULL;
5862 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5863 for (elt_offset = nelements / 2;
5864 elt_offset >= 1;
5865 elt_offset /= 2)
5867 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5868 indices.new_vector (sel, 2, nelements);
5869 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5870 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5871 new_temp, zero_vec, mask);
5872 new_temp = gimple_build (&stmts, code,
5873 vectype1, new_name, new_temp);
5875 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5877 /* 2.4 Extract the final scalar result. Create:
5878 s_out3 = extract_field <v_out2, bitpos> */
5880 if (dump_enabled_p ())
5881 dump_printf_loc (MSG_NOTE, vect_location,
5882 "extract scalar result\n");
5884 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5885 bitsize, bitsize_zero_node);
5886 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5887 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5888 gimple_assign_set_lhs (epilog_stmt, new_temp);
5889 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5890 scalar_results.safe_push (new_temp);
5892 else
5894 /* Case 3: Create:
5895 s = extract_field <v_out2, 0>
5896 for (offset = element_size;
5897 offset < vector_size;
5898 offset += element_size)
5900 Create: s' = extract_field <v_out2, offset>
5901 Create: s = op <s, s'> // For non SLP cases
5902 } */
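   /* For an assumed 4-element vector and PLUS this is simply
        s = v_out2[0] + v_out2[1] + v_out2[2] + v_out2[3];
      whereas in the SLP case the extracted elements are collected
      individually in SCALAR_RESULTS instead of being combined.  */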
5904 if (dump_enabled_p ())
5905 dump_printf_loc (MSG_NOTE, vect_location,
5906 "Reduce using scalar code.\n");
5908 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5909 int element_bitsize = tree_to_uhwi (bitsize);
5910 tree compute_type = TREE_TYPE (vectype);
5911 gimple_seq stmts = NULL;
5912 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5914 int bit_offset;
5915 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5916 vec_temp, bitsize, bitsize_zero_node);
5918 /* In SLP we don't need to apply the reduction operation, so we just
5919 collect s' values in SCALAR_RESULTS. */
5920 if (slp_reduc)
5921 scalar_results.safe_push (new_temp);
5923 for (bit_offset = element_bitsize;
5924 bit_offset < vec_size_in_bits;
5925 bit_offset += element_bitsize)
5927 tree bitpos = bitsize_int (bit_offset);
5928 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5929 compute_type, vec_temp,
5930 bitsize, bitpos);
5931 if (slp_reduc)
5933 /* In SLP we don't need to apply the reduction operation, so
5934 we just collect s' values in SCALAR_RESULTS. */
5935 new_temp = new_name;
5936 scalar_results.safe_push (new_name);
5938 else
5939 new_temp = gimple_build (&stmts, code, compute_type,
5940 new_name, new_temp);
5944 /* The only case where we need to reduce scalar results in SLP is
5945 unrolling. If the size of SCALAR_RESULTS is greater than
5946 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5947 REDUC_GROUP_SIZE. */
5948 if (slp_reduc)
5950 tree res, first_res, new_res;
5952 /* Reduce multiple scalar results in case of SLP unrolling. */
5953 for (j = group_size; scalar_results.iterate (j, &res);
5954 j++)
5956 first_res = scalar_results[j % group_size];
5957 new_res = gimple_build (&stmts, code, compute_type,
5958 first_res, res);
5959 scalar_results[j % group_size] = new_res;
5961 scalar_results.truncate (group_size);
5962 for (k = 0; k < group_size; k++)
5963 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5964 scalar_results[k]);
5966 else
5968 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5969 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5970 scalar_results.safe_push (new_temp);
5973 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5976 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5977 && induc_val)
5979 /* Earlier we set the initial value to be a vector of induc_val
5980 values. Check the result and if it is induc_val then replace
5981 it with the original initial value, unless induc_val is
5982 already the same as initial_def.
5983 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5984 induc_val);
5985 tree initial_def = reduc_info->reduc_initial_values[0];
5987 tree tmp = make_ssa_name (new_scalar_dest);
5988 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5989 initial_def, new_temp);
5990 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5991 scalar_results[0] = tmp;
5995 /* 2.5 Adjust the final result by the initial value of the reduction
5996 variable. (When such adjustment is not needed, then
5997 'adjustment_def' is zero). For example, if code is PLUS we create:
5998 new_temp = loop_exit_def + adjustment_def */
6000 if (adjustment_def)
6002 gcc_assert (!slp_reduc);
6003 gimple_seq stmts = NULL;
6004 if (double_reduc)
6006 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6007 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6008 new_temp = gimple_build (&stmts, code, vectype,
6009 reduc_inputs[0], adjustment_def);
6011 else
6013 new_temp = scalar_results[0];
6014 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6015 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
6016 new_temp = gimple_build (&stmts, code, scalar_type,
6017 new_temp, adjustment_def);
6020 epilog_stmt = gimple_seq_last_stmt (stmts);
6021 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6022 scalar_results[0] = new_temp;
6025 /* Record this operation if it could be reused by the epilogue loop. */
6026 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
6027 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6028 { orig_reduc_input, reduc_info });
6030 if (double_reduc)
6031 loop = outer_loop;
6033 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6034 phis with new adjusted scalar results, i.e., replace use <s_out0>
6035 with use <s_out4>.
6037 Transform:
6038 loop_exit:
6039 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6040 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6041 v_out2 = reduce <v_out1>
6042 s_out3 = extract_field <v_out2, 0>
6043 s_out4 = adjust_result <s_out3>
6044 use <s_out0>
6045 use <s_out0>
6047 into:
6049 loop_exit:
6050 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6051 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6052 v_out2 = reduce <v_out1>
6053 s_out3 = extract_field <v_out2, 0>
6054 s_out4 = adjust_result <s_out3>
6055 use <s_out4>
6056 use <s_out4> */
6058 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6059 for (k = 0; k < live_out_stmts.size (); k++)
6061 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6062 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
6064 phis.create (3);
6065 /* Find the loop-closed-use at the loop exit of the original scalar
6066 result. (The reduction result is expected to have two immediate uses,
6067 one at the latch block, and one at the loop exit). For double
6068 reductions we are looking for exit phis of the outer loop. */
6069 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6071 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6073 if (!is_gimple_debug (USE_STMT (use_p)))
6074 phis.safe_push (USE_STMT (use_p));
6076 else
6078 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6080 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6082 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6084 if (!flow_bb_inside_loop_p (loop,
6085 gimple_bb (USE_STMT (phi_use_p)))
6086 && !is_gimple_debug (USE_STMT (phi_use_p)))
6087 phis.safe_push (USE_STMT (phi_use_p));
6093 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6095 /* Replace the uses: */
6096 orig_name = PHI_RESULT (exit_phi);
6098 /* Look for a single use at the target of the skip edge. */
6099 if (unify_with_main_loop_p)
6101 use_operand_p use_p;
6102 gimple *user;
6103 if (!single_imm_use (orig_name, &use_p, &user))
6104 gcc_unreachable ();
6105 orig_name = gimple_get_lhs (user);
6108 scalar_result = scalar_results[k];
6109 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6111 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6112 SET_USE (use_p, scalar_result);
6113 update_stmt (use_stmt);
6117 phis.release ();
6121 /* Return a vector of type VECTYPE that is equal to the vector select
6122 operation "MASK ? VEC : IDENTITY". Insert the select statements
6123 before GSI. */
6125 static tree
6126 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6127 tree vec, tree identity)
6129 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6130 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6131 mask, vec, identity);
6132 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6133 return cond;
6136 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6137 order, starting with LHS. Insert the extraction statements before GSI and
6138 associate the new scalar SSA names with variable SCALAR_DEST.
6139 Return the SSA name for the result. */
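/* For an assumed four-element VECTOR_RHS this produces the left-to-right
   chain

     lhs = (((lhs CODE v[0]) CODE v[1]) CODE v[2]) CODE v[3]

   which keeps the scalar evaluation order that in-order (fold-left)
   reductions require.  */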
6141 static tree
6142 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6143 tree_code code, tree lhs, tree vector_rhs)
6145 tree vectype = TREE_TYPE (vector_rhs);
6146 tree scalar_type = TREE_TYPE (vectype);
6147 tree bitsize = TYPE_SIZE (scalar_type);
6148 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6149 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6151 for (unsigned HOST_WIDE_INT bit_offset = 0;
6152 bit_offset < vec_size_in_bits;
6153 bit_offset += element_bitsize)
6155 tree bitpos = bitsize_int (bit_offset);
6156 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6157 bitsize, bitpos);
6159 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6160 rhs = make_ssa_name (scalar_dest, stmt);
6161 gimple_assign_set_lhs (stmt, rhs);
6162 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6164 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6165 tree new_name = make_ssa_name (scalar_dest, stmt);
6166 gimple_assign_set_lhs (stmt, new_name);
6167 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6168 lhs = new_name;
6170 return lhs;
6173 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6174 type of the vector input. */
6176 static internal_fn
6177 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6179 internal_fn mask_reduc_fn;
6181 switch (reduc_fn)
6183 case IFN_FOLD_LEFT_PLUS:
6184 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6185 break;
6187 default:
6188 return IFN_LAST;
6191 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6192 OPTIMIZE_FOR_SPEED))
6193 return mask_reduc_fn;
6194 return IFN_LAST;
6197 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6198 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6199 statement. CODE is the operation performed by STMT_INFO and OPS are
6200 its scalar operands. REDUC_INDEX is the index of the operand in
6201 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6202 implements in-order reduction, or IFN_LAST if we should open-code it.
6203 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6204 that should be used to control the operation in a fully-masked loop. */
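/* As an illustration (operation assumed): for an in-order float addition
   with target support each vector input turns into
     reduc_var = .FOLD_LEFT_PLUS (reduc_var, def0);
   or, in a fully-masked loop with IFN_MASK_FOLD_LEFT_PLUS available,
     reduc_var = .MASK_FOLD_LEFT_PLUS (reduc_var, def0, mask);
   while without target support the vector is expanded element by element
   via vect_expand_fold_left above.  */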
6206 static bool
6207 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6208 stmt_vec_info stmt_info,
6209 gimple_stmt_iterator *gsi,
6210 gimple **vec_stmt, slp_tree slp_node,
6211 gimple *reduc_def_stmt,
6212 tree_code code, internal_fn reduc_fn,
6213 tree ops[3], tree vectype_in,
6214 int reduc_index, vec_loop_masks *masks)
6216 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6217 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6218 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6220 int ncopies;
6221 if (slp_node)
6222 ncopies = 1;
6223 else
6224 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6226 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6227 gcc_assert (ncopies == 1);
6228 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6230 if (slp_node)
6231 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6232 TYPE_VECTOR_SUBPARTS (vectype_in)));
6234 tree op0 = ops[1 - reduc_index];
6236 int group_size = 1;
6237 stmt_vec_info scalar_dest_def_info;
6238 auto_vec<tree> vec_oprnds0;
6239 if (slp_node)
6241 auto_vec<vec<tree> > vec_defs (2);
6242 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6243 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6244 vec_defs[0].release ();
6245 vec_defs[1].release ();
6246 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6247 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6249 else
6251 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6252 op0, &vec_oprnds0);
6253 scalar_dest_def_info = stmt_info;
6256 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6257 tree scalar_type = TREE_TYPE (scalar_dest);
6258 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6260 int vec_num = vec_oprnds0.length ();
6261 gcc_assert (vec_num == 1 || slp_node);
6262 tree vec_elem_type = TREE_TYPE (vectype_out);
6263 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6265 tree vector_identity = NULL_TREE;
6266 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6267 vector_identity = build_zero_cst (vectype_out);
6269 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6270 int i;
6271 tree def0;
6272 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6274 gimple *new_stmt;
6275 tree mask = NULL_TREE;
6276 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6277 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6279 /* Handle MINUS by adding the negative. */
6280 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6282 tree negated = make_ssa_name (vectype_out);
6283 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6284 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6285 def0 = negated;
6288 if (mask && mask_reduc_fn == IFN_LAST)
6289 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6290 vector_identity);
6292 /* On the first iteration the input is simply the scalar phi
6293 result, and for subsequent iterations it is the output of
6294 the preceding operation. */
6295 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6297 if (mask && mask_reduc_fn != IFN_LAST)
6298 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6299 def0, mask);
6300 else
6301 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6302 def0);
6303 /* For chained SLP reductions the output of the previous reduction
6304 operation serves as the input of the next. For the final statement
6305 the output cannot be a temporary - we reuse the original
6306 scalar destination of the last statement. */
6307 if (i != vec_num - 1)
6309 gimple_set_lhs (new_stmt, scalar_dest_var);
6310 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6311 gimple_set_lhs (new_stmt, reduc_var);
6314 else
6316 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6317 reduc_var, def0);
6318 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6319 /* Remove the statement, so that we can use the same code paths
6320 as for statements that we've just created. */
6321 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6322 gsi_remove (&tmp_gsi, true);
6325 if (i == vec_num - 1)
6327 gimple_set_lhs (new_stmt, scalar_dest);
6328 vect_finish_replace_stmt (loop_vinfo,
6329 scalar_dest_def_info,
6330 new_stmt);
6332 else
6333 vect_finish_stmt_generation (loop_vinfo,
6334 scalar_dest_def_info,
6335 new_stmt, gsi);
6337 if (slp_node)
6338 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6339 else
6341 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6342 *vec_stmt = new_stmt;
6346 return true;
6349 /* Function is_nonwrapping_integer_induction.
6351 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6352 does not cause overflow. */
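/* For instance, with assumed values BASE == 0, STEP == 4 and at most
   1000 iterations, the check below requires that 0 + 4 * 1000 still fits
   in the precision of the induction's type (unless overflow is undefined
   for that type, in which case the check is not needed).  */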
6354 static bool
6355 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6357 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6358 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6359 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6360 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6361 widest_int ni, max_loop_value, lhs_max;
6362 wi::overflow_type overflow = wi::OVF_NONE;
6364 /* Make sure the loop is integer based. */
6365 if (TREE_CODE (base) != INTEGER_CST
6366 || TREE_CODE (step) != INTEGER_CST)
6367 return false;
6369 /* Check that the max size of the loop will not wrap. */
6371 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6372 return true;
6374 if (! max_stmt_executions (loop, &ni))
6375 return false;
6377 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6378 &overflow);
6379 if (overflow)
6380 return false;
6382 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6383 TYPE_SIGN (lhs_type), &overflow);
6384 if (overflow)
6385 return false;
6387 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6388 <= TYPE_PRECISION (lhs_type));
6391 /* Check if masking can be supported by inserting a conditional expression.
6392 CODE is the code for the operation. COND_FN is the conditional internal
6393 function, if it exists. VECTYPE_IN is the type of the vector input. */
6394 static bool
6395 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6396 tree vectype_in)
6398 if (cond_fn != IFN_LAST
6399 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6400 OPTIMIZE_FOR_SPEED))
6401 return false;
6403 switch (code)
6405 case DOT_PROD_EXPR:
6406 case SAD_EXPR:
6407 return true;
6409 default:
6410 return false;
6414 /* Insert a conditional expression to enable masked vectorization. CODE is the
6415 code for the operation. VOP is the array of operands. MASK is the loop
6416 mask. GSI is a statement iterator used to place the new conditional
6417 expression. */
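/* Sketched for the two supported codes: for DOT_PROD_EXPR the inactive
   lanes of operand 1 are replaced by zero, so they contribute nothing to
   the dot product; for SAD_EXPR they are replaced by the corresponding
   lanes of operand 0, making the absolute difference of inactive lanes
   zero as well.  */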
6418 static void
6419 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6420 gimple_stmt_iterator *gsi)
6422 switch (code)
6424 case DOT_PROD_EXPR:
6426 tree vectype = TREE_TYPE (vop[1]);
6427 tree zero = build_zero_cst (vectype);
6428 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6429 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6430 mask, vop[1], zero);
6431 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6432 vop[1] = masked_op1;
6433 break;
6436 case SAD_EXPR:
6438 tree vectype = TREE_TYPE (vop[1]);
6439 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6440 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6441 mask, vop[1], vop[0]);
6442 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6443 vop[1] = masked_op1;
6444 break;
6447 default:
6448 gcc_unreachable ();
6452 /* Function vectorizable_reduction.
6454 Check if STMT_INFO performs a reduction operation that can be vectorized.
6455 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6456 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6457 Return true if STMT_INFO is vectorizable in this way.
6459 This function also handles reduction idioms (patterns) that have been
6460 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6461 may be of this form:
6462 X = pattern_expr (arg0, arg1, ..., X)
6463 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6464 sequence that had been detected and replaced by the pattern-stmt
6465 (STMT_INFO).
6467 This function also handles reduction of condition expressions, for example:
6468 for (int i = 0; i < N; i++)
6469 if (a[i] < value)
6470 last = a[i];
6471 This is handled by vectorising the loop and creating an additional vector
6472 containing the loop indexes for which "a[i] < value" was true. In the
6473 function epilogue this is reduced to a single max value and then used to
6474 index into the vector of results.
6476 In some cases of reduction patterns, the type of the reduction variable X is
6477 different than the type of the other arguments of STMT_INFO.
6478 In such cases, the vectype that is used when transforming STMT_INFO into
6479 a vector stmt is different than the vectype that is used to determine the
6480 vectorization factor, because it consists of a different number of elements
6481 than the actual number of elements that are being operated upon in parallel.
6483 For example, consider an accumulation of shorts into an int accumulator.
6484 On some targets it's possible to vectorize this pattern operating on 8
6485 shorts at a time (hence, the vectype for purposes of determining the
6486 vectorization factor should be V8HI); on the other hand, the vectype that
6487 is used to create the vector form is actually V4SI (the type of the result).
6489 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6490 indicates the actual level of parallelism (V8HI in the example), so
6491 that the right vectorization factor is derived. This vectype
6492 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6493 be used to create the vectorized stmt. The right vectype for the vectorized
6494 stmt is obtained from the type of the result X:
6495 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6497 This means that, contrary to "regular" reductions (or "regular" stmts in
6498 general), the following equation:
6499 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6500 does *NOT* necessarily hold for reduction patterns. */
6502 bool
6503 vectorizable_reduction (loop_vec_info loop_vinfo,
6504 stmt_vec_info stmt_info, slp_tree slp_node,
6505 slp_instance slp_node_instance,
6506 stmt_vector_for_cost *cost_vec)
6508 tree scalar_dest;
6509 tree vectype_in = NULL_TREE;
6510 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6511 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6512 stmt_vec_info cond_stmt_vinfo = NULL;
6513 tree scalar_type;
6514 int i;
6515 int ncopies;
6516 bool single_defuse_cycle = false;
6517 bool nested_cycle = false;
6518 bool double_reduc = false;
6519 int vec_num;
6520 tree tem;
6521 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6522 tree cond_reduc_val = NULL_TREE;
6524 /* Make sure it was already recognized as a reduction computation. */
6525 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6526 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6527 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6528 return false;
6530 /* The stmt we store reduction analysis meta on. */
6531 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6532 reduc_info->is_reduc_info = true;
6534 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6536 if (is_a <gphi *> (stmt_info->stmt))
6538 if (slp_node)
6540 /* We eventually need to set a vector type on invariant
6541 arguments. */
6542 unsigned j;
6543 slp_tree child;
6544 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6545 if (!vect_maybe_update_slp_op_vectype
6546 (child, SLP_TREE_VECTYPE (slp_node)))
6548 if (dump_enabled_p ())
6549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6550 "incompatible vector types for "
6551 "invariants\n");
6552 return false;
6555 /* Analysis for double-reduction is done on the outer
6556 loop PHI, nested cycles have no further restrictions. */
6557 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6559 else
6560 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6561 return true;
6564 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6565 stmt_vec_info phi_info = stmt_info;
6566 if (!is_a <gphi *> (stmt_info->stmt))
6568 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6569 return true;
6571 if (slp_node)
6573 slp_node_instance->reduc_phis = slp_node;
6574 /* ??? We're leaving slp_node to point to the PHIs; we only
6575 need it to get at the number of vector stmts, which wasn't
6576 yet initialized for the instance root. */
6578 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6579 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6580 else
6582 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6583 == vect_double_reduction_def);
6584 use_operand_p use_p;
6585 gimple *use_stmt;
6586 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6587 &use_p, &use_stmt);
6588 gcc_assert (res);
6589 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6590 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6593 /* PHIs should not participate in patterns. */
6594 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6595 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6597 /* Verify that following REDUC_IDX from the latch def leads us back to
6598 the PHI and compute the reduction chain length. Discover the real
6599 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6600 tree reduc_def
6601 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6602 loop_latch_edge
6603 (gimple_bb (reduc_def_phi)->loop_father));
6604 unsigned reduc_chain_length = 0;
6605 bool only_slp_reduc_chain = true;
6606 stmt_info = NULL;
6607 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6608 while (reduc_def != PHI_RESULT (reduc_def_phi))
6610 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6611 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6612 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6614 if (dump_enabled_p ())
6615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6616 "reduction chain broken by patterns.\n");
6617 return false;
6619 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6620 only_slp_reduc_chain = false;
6621 /* ??? For epilogue generation live members of the chain need
6622 to point back to the PHI via their original stmt for
6623 info_for_reduction to work. */
6624 if (STMT_VINFO_LIVE_P (vdef))
6625 STMT_VINFO_REDUC_DEF (def) = phi_info;
6626 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6627 if (!assign)
6629 if (dump_enabled_p ())
6630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6631 "reduction chain includes calls.\n");
6632 return false;
6634 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6636 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6637 TREE_TYPE (gimple_assign_rhs1 (assign))))
6639 if (dump_enabled_p ())
6640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6641 "conversion in the reduction chain.\n");
6642 return false;
6645 else if (!stmt_info)
6646 /* First non-conversion stmt. */
6647 stmt_info = vdef;
6648 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6649 reduc_chain_length++;
6650 if (!stmt_info && slp_node)
6651 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6653 /* PHIs should not participate in patterns. */
6654 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6656 if (nested_in_vect_loop_p (loop, stmt_info))
6658 loop = loop->inner;
6659 nested_cycle = true;
6662 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6663 element. */
6664 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6666 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6667 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6669 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6670 gcc_assert (slp_node
6671 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6673 /* 1. Is vectorizable reduction? */
6674 /* Not supportable if the reduction variable is used in the loop, unless
6675 it's a reduction chain. */
6676 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6677 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6678 return false;
6680 /* Reductions that are not used even in an enclosing outer-loop,
6681 are expected to be "live" (used out of the loop). */
6682 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6683 && !STMT_VINFO_LIVE_P (stmt_info))
6684 return false;
6686 /* 2. Has this been recognized as a reduction pattern?
6688 Check if STMT represents a pattern that has been recognized
6689 in earlier analysis stages. For stmts that represent a pattern,
6690 the STMT_VINFO_RELATED_STMT field records the last stmt in
6691 the original sequence that constitutes the pattern. */
6693 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6694 if (orig_stmt_info)
6696 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6697 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6700 /* 3. Check the operands of the operation. The first operands are defined
6701 inside the loop body. The last operand is the reduction variable,
6702 which is defined by the loop-header-phi. */
6704 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6705 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6706 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6707 enum tree_code code = gimple_assign_rhs_code (stmt);
6708 bool lane_reduc_code_p
6709 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6710 int op_type = TREE_CODE_LENGTH (code);
6711 enum optab_subtype optab_query_kind = optab_vector;
6712 if (code == DOT_PROD_EXPR
6713 && TYPE_SIGN (TREE_TYPE (gimple_assign_rhs1 (stmt)))
6714 != TYPE_SIGN (TREE_TYPE (gimple_assign_rhs2 (stmt))))
6715 optab_query_kind = optab_vector_mixed_sign;
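/* Illustration only (hypothetical user code, not part of this file):
   DOT_PROD_EXPR typically results from a widening dot-product loop such as

     short a[N], b[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += a[i] * b[i];

   where each vector statement folds several narrow lanes into the wider
   accumulator; WIDEN_SUM_EXPR and SAD_EXPR cover similar lane-reducing
   patterns.  */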
6718 scalar_dest = gimple_assign_lhs (stmt);
6719 scalar_type = TREE_TYPE (scalar_dest);
6720 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6721 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6722 return false;
6724 /* Do not try to vectorize bit-precision reductions. */
6725 if (!type_has_mode_precision_p (scalar_type))
6726 return false;
6728 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6729 which means the only use of the reduction PHI may be in the lane-reducing operation. */
6730 if (lane_reduc_code_p
6731 && reduc_chain_length != 1
6732 && !only_slp_reduc_chain)
6734 if (dump_enabled_p ())
6735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6736 "lane-reducing reduction with extra stmts.\n");
6737 return false;
6740 /* All uses but the last are expected to be defined in the loop.
6741 The last use is the reduction variable. In case of nested cycle this
6742 assumption is not true: we use reduc_index to record the index of the
6743 reduction variable. */
6744 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6745 /* We need to skip an extra operand for COND_EXPRs with embedded
6746 comparison. */
6747 unsigned opno_adjust = 0;
6748 if (code == COND_EXPR
6749 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6750 opno_adjust = 1;
6751 for (i = 0; i < op_type; i++)
6753 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6754 if (i == 0 && code == COND_EXPR)
6755 continue;
6757 stmt_vec_info def_stmt_info;
6758 enum vect_def_type dt;
6759 tree op;
6760 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6761 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6762 &def_stmt_info))
6764 if (dump_enabled_p ())
6765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6766 "use not simple.\n");
6767 return false;
6769 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6770 continue;
6772 /* There should be only one cycle def in the stmt, the one
6773 leading to reduc_def. */
6774 if (VECTORIZABLE_CYCLE_DEF (dt))
6775 return false;
6777 /* To properly compute ncopies we are interested in the widest
6778 non-reduction input type in case we're looking at a widening
6779 accumulation that we later handle in vect_transform_reduction. */
6780 if (lane_reduc_code_p
6781 && tem
6782 && (!vectype_in
6783 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6784 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6785 vectype_in = tem;
6787 if (code == COND_EXPR)
6789 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6790 if (dt == vect_constant_def)
6792 cond_reduc_dt = dt;
6793 cond_reduc_val = op;
6795 if (dt == vect_induction_def
6796 && def_stmt_info
6797 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6799 cond_reduc_dt = dt;
6800 cond_stmt_vinfo = def_stmt_info;
6804 if (!vectype_in)
6805 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6806 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6808 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6809 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
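/* Illustrative sketch of a COND_REDUCTION (hypothetical source, not part
   of this file): the reduction value is only updated under a condition,
   for example

     int last = -1;
     for (int i = 0; i < n; i++)
       if (a[i] < b[i])
         last = i;

   so the vectorized form has to remember which lane matched last.  */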
6810 /* If we have a condition reduction, see if we can simplify it further. */
6811 if (v_reduc_type == COND_REDUCTION)
6813 if (slp_node)
6814 return false;
6816 /* If the condition itself uses the reduction value, fail. */
6817 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6819 if (dump_enabled_p ())
6820 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6821 "condition depends on previous iteration\n");
6822 return false;
6825 if (reduc_chain_length == 1
6826 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6827 vectype_in, OPTIMIZE_FOR_SPEED))
6829 if (dump_enabled_p ())
6830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6831 "optimizing condition reduction with"
6832 " FOLD_EXTRACT_LAST.\n");
6833 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6835 else if (cond_reduc_dt == vect_induction_def)
6837 tree base
6838 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6839 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6841 gcc_assert (TREE_CODE (base) == INTEGER_CST
6842 && TREE_CODE (step) == INTEGER_CST);
6843 cond_reduc_val = NULL_TREE;
6844 enum tree_code cond_reduc_op_code = ERROR_MARK;
6845 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6846 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6848 /* Find a suitable value: below base for MAX_EXPR, above base for
6849 MIN_EXPR; for now punt if base is the minimum value of the type for
6850 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6851 else if (tree_int_cst_sgn (step) == -1)
6853 cond_reduc_op_code = MIN_EXPR;
6854 if (tree_int_cst_sgn (base) == -1)
6855 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6856 else if (tree_int_cst_lt (base,
6857 TYPE_MAX_VALUE (TREE_TYPE (base))))
6858 cond_reduc_val
6859 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6861 else
6863 cond_reduc_op_code = MAX_EXPR;
6864 if (tree_int_cst_sgn (base) == 1)
6865 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6866 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6867 base))
6868 cond_reduc_val
6869 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6871 if (cond_reduc_val)
6873 if (dump_enabled_p ())
6874 dump_printf_loc (MSG_NOTE, vect_location,
6875 "condition expression based on "
6876 "integer induction.\n");
6877 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6878 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6879 = cond_reduc_val;
6880 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6883 else if (cond_reduc_dt == vect_constant_def)
6885 enum vect_def_type cond_initial_dt;
6886 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6887 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6888 if (cond_initial_dt == vect_constant_def
6889 && types_compatible_p (TREE_TYPE (cond_initial_val),
6890 TREE_TYPE (cond_reduc_val)))
6892 tree e = fold_binary (LE_EXPR, boolean_type_node,
6893 cond_initial_val, cond_reduc_val);
6894 if (e && (integer_onep (e) || integer_zerop (e)))
6896 if (dump_enabled_p ())
6897 dump_printf_loc (MSG_NOTE, vect_location,
6898 "condition expression based on "
6899 "compile time constant.\n");
6900 /* Record reduction code at analysis stage. */
6901 STMT_VINFO_REDUC_CODE (reduc_info)
6902 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6903 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6909 if (STMT_VINFO_LIVE_P (phi_info))
6910 return false;
6912 if (slp_node)
6913 ncopies = 1;
6914 else
6915 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6917 gcc_assert (ncopies >= 1);
6919 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6921 if (nested_cycle)
6923 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6924 == vect_double_reduction_def);
6925 double_reduc = true;
6928 /* 4.2. Check support for the epilog operation.
6930 If STMT represents a reduction pattern, then the type of the
6931 reduction variable may be different than the type of the rest
6932 of the arguments. For example, consider the case of accumulation
6933 of shorts into an int accumulator. The original code:
6934 S1: int_a = (int) short_a;
6935 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6937 was replaced with:
6938 STMT: int_acc = widen_sum <short_a, int_acc>
6940 This means that:
6941 1. The tree-code that is used to create the vector operation in the
6942 epilog code (that reduces the partial results) is not the
6943 tree-code of STMT, but is rather the tree-code of the original
6944 stmt from the pattern that STMT is replacing. I.e, in the example
6945 above we want to use 'widen_sum' in the loop, but 'plus' in the
6946 epilog.
6947 2. The type (mode) we use to check available target support
6948 for the vector operation to be created in the *epilog*, is
6949 determined by the type of the reduction variable (in the example
6950 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6951 However the type (mode) we use to check available target support
6952 for the vector operation to be created *inside the loop*, is
6953 determined by the type of the other arguments to STMT (in the
6954 example we'd check this: optab_handler (widen_sum_optab,
6955 vect_short_mode)).
6957 This is contrary to "regular" reductions, in which the types of all
6958 the arguments are the same as the type of the reduction variable.
6959 For "regular" reductions we can therefore use the same vector type
6960 (and also the same tree-code) when generating the epilog code and
6961 when generating the code inside the loop. */
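/* Illustrative user-level sketch of the widening accumulation described
   above (hypothetical source, not part of this file):

     short s[N];
     int acc = 0;
     for (int i = 0; i < N; i++)
       acc += s[i];

   Inside the loop the target must support the widening operation on the
   narrow (short) vector type, while the epilog that folds the partial sums
   only needs a plain addition on the wide (int) vector type.  */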
6963 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6964 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6966 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6967 if (reduction_type == TREE_CODE_REDUCTION)
6969 /* Check whether it's ok to change the order of the computation.
6970 Generally, when vectorizing a reduction we change the order of the
6971 computation. This may change the behavior of the program in some
6972 cases, so we need to check that this is ok. One exception is when
6973 vectorizing an outer-loop: the inner-loop is executed sequentially,
6974 and therefore vectorizing reductions in the inner-loop during
6975 outer-loop vectorization is safe. Likewise when we are vectorizing
6976 a series of reductions using SLP and the VF is one the reductions
6977 are performed in scalar order. */
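/* Illustrative sketch (hypothetical source, not from this file): a
   floating-point reduction such as

     float sum = 0.0f;
     for (int i = 0; i < n; i++)
       sum += a[i];

   is vectorized using per-lane partial sums, which reassociates the
   additions and may change the rounded result; unless reassociation is
   permitted, such reductions take the fold-left path below.  */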
6978 if (slp_node
6979 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6980 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6982 else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6984 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6985 is not directly used in stmt. */
6986 if (!only_slp_reduc_chain
6987 && reduc_chain_length != 1)
6989 if (dump_enabled_p ())
6990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6991 "in-order reduction chain without SLP.\n");
6992 return false;
6994 STMT_VINFO_REDUC_TYPE (reduc_info)
6995 = reduction_type = FOLD_LEFT_REDUCTION;
6997 else if (!commutative_tree_code (orig_code)
6998 || !associative_tree_code (orig_code))
7000 if (dump_enabled_p ())
7001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7002 "reduction: not commutative/associative");
7003 return false;
7007 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7008 && ncopies > 1)
7010 if (dump_enabled_p ())
7011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7012 "multiple types in double reduction or condition "
7013 "reduction or fold-left reduction.\n");
7014 return false;
7017 internal_fn reduc_fn = IFN_LAST;
7018 if (reduction_type == TREE_CODE_REDUCTION
7019 || reduction_type == FOLD_LEFT_REDUCTION
7020 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7021 || reduction_type == CONST_COND_REDUCTION)
7023 if (reduction_type == FOLD_LEFT_REDUCTION
7024 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7025 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7027 if (reduc_fn != IFN_LAST
7028 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7029 OPTIMIZE_FOR_SPEED))
7031 if (dump_enabled_p ())
7032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7033 "reduc op not supported by target.\n");
7035 reduc_fn = IFN_LAST;
7038 else
7040 if (!nested_cycle || double_reduc)
7042 if (dump_enabled_p ())
7043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7044 "no reduc code for scalar code.\n");
7046 return false;
7050 else if (reduction_type == COND_REDUCTION)
7052 int scalar_precision
7053 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7054 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7055 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7056 vectype_out);
7058 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7059 OPTIMIZE_FOR_SPEED))
7060 reduc_fn = IFN_REDUC_MAX;
7062 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7064 if (reduction_type != EXTRACT_LAST_REDUCTION
7065 && (!nested_cycle || double_reduc)
7066 && reduc_fn == IFN_LAST
7067 && !nunits_out.is_constant ())
7069 if (dump_enabled_p ())
7070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7071 "missing target support for reduction on"
7072 " variable-length vectors.\n");
7073 return false;
7076 /* For SLP reductions, see if there is a neutral value we can use. */
7077 tree neutral_op = NULL_TREE;
7078 if (slp_node)
7080 tree initial_value = NULL_TREE;
7081 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7082 initial_value = vect_phi_initial_value (reduc_def_phi);
7083 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7084 orig_code, initial_value);
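/* For illustration (values as returned by neutral_op_for_reduction, listed
   here as a reminder rather than a specification): the neutral value is 0
   for PLUS, BIT_IOR and BIT_XOR, 1 for MULT, all-ones for BIT_AND, and for
   MIN/MAX the initial value itself is used.  */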
7087 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7089 /* We can't support in-order reductions of code such as this:
7091 for (int i = 0; i < n1; ++i)
7092 for (int j = 0; j < n2; ++j)
7093 l += a[j];
7095 since GCC effectively transforms the loop when vectorizing:
7097 for (int i = 0; i < n1 / VF; ++i)
7098 for (int j = 0; j < n2; ++j)
7099 for (int k = 0; k < VF; ++k)
7100 l += a[j];
7102 which is a reassociation of the original operation. */
7103 if (dump_enabled_p ())
7104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7105 "in-order double reduction not supported.\n");
7107 return false;
7110 if (reduction_type == FOLD_LEFT_REDUCTION
7111 && slp_node
7112 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7114 /* We cannot use in-order reductions in this case because there is
7115 an implicit reassociation of the operations involved. */
7116 if (dump_enabled_p ())
7117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7118 "in-order unchained SLP reductions not supported.\n");
7119 return false;
7122 /* For double reductions, and for SLP reductions with a neutral value,
7123 we construct a variable-length initial vector by loading a vector
7124 full of the neutral value and then shift-and-inserting the start
7125 values into the low-numbered elements. */
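/* Illustrative sketch of that construction (values hypothetical): with
   neutral value 0 and start value S the initial accumulator is built as

     acc = { 0, 0, ..., 0 };              splat of the neutral value
     acc = VEC_SHL_INSERT (acc, S);       yields { S, 0, ..., 0 }

   so only the low-numbered element carries the start value.  */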
7126 if ((double_reduc || neutral_op)
7127 && !nunits_out.is_constant ()
7128 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7129 vectype_out, OPTIMIZE_FOR_SPEED))
7131 if (dump_enabled_p ())
7132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7133 "reduction on variable-length vectors requires"
7134 " target support for a vector-shift-and-insert"
7135 " operation.\n");
7136 return false;
7139 /* Check extra constraints for variable-length unchained SLP reductions. */
7140 if (STMT_SLP_TYPE (stmt_info)
7141 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7142 && !nunits_out.is_constant ())
7144 /* We checked above that we could build the initial vector when
7145 there's a neutral element value. Check here for the case in
7146 which each SLP statement has its own initial value and in which
7147 that value needs to be repeated for every instance of the
7148 statement within the initial vector. */
7149 unsigned int group_size = SLP_TREE_LANES (slp_node);
7150 if (!neutral_op
7151 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7152 TREE_TYPE (vectype_out)))
7154 if (dump_enabled_p ())
7155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7156 "unsupported form of SLP reduction for"
7157 " variable-length vectors: cannot build"
7158 " initial vector.\n");
7159 return false;
7161 /* The epilogue code relies on the number of elements being a multiple
7162 of the group size. The duplicate-and-interleave approach to setting
7163 up the initial vector does too. */
7164 if (!multiple_p (nunits_out, group_size))
7166 if (dump_enabled_p ())
7167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7168 "unsupported form of SLP reduction for"
7169 " variable-length vectors: the vector size"
7170 " is not a multiple of the number of results.\n");
7171 return false;
7175 if (reduction_type == COND_REDUCTION)
7177 widest_int ni;
7179 if (! max_loop_iterations (loop, &ni))
7181 if (dump_enabled_p ())
7182 dump_printf_loc (MSG_NOTE, vect_location,
7183 "loop count not known, cannot create cond "
7184 "reduction.\n");
7185 return false;
7187 /* Convert backedges to iterations. */
7188 ni += 1;
7190 /* The additional index will have the same type as the condition. Check
7191 that the loop iteration count fits into this type less one (because
7192 we use up the zero slot for when there are no matches). */
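/* Worked example of the check below (numbers illustrative): for an 8-bit
   reduction value, cr_index_scalar_type is an unsigned 8-bit type with
   maximum 255.  Index 0 is reserved for "no match", so the iteration count
   ni must stay strictly below 255, i.e. at most 254 iterations.  */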
7193 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7194 if (wi::geu_p (ni, wi::to_widest (max_index)))
7196 if (dump_enabled_p ())
7197 dump_printf_loc (MSG_NOTE, vect_location,
7198 "loop size is greater than data size.\n");
7199 return false;
7203 /* In case the vectorization factor (VF) is bigger than the number
7204 of elements that we can fit in a vectype (nunits), we have to generate
7205 more than one vector stmt - i.e. - we need to "unroll" the
7206 vector stmt by a factor VF/nunits. For more details see documentation
7207 in vectorizable_operation. */
7209 /* If the reduction is used in an outer loop we need to generate
7210 VF intermediate results, like so (e.g. for ncopies=2):
7211 r0 = phi (init, r0)
7212 r1 = phi (init, r1)
7213 r0 = x0 + r0;
7214 r1 = x1 + r1;
7215 (i.e. we generate VF results in 2 registers).
7216 In this case we have a separate def-use cycle for each copy, and therefore
7217 for each copy we get the vector def for the reduction variable from the
7218 respective phi node created for this copy.
7220 Otherwise (the reduction is unused in the loop nest), we can combine
7221 together intermediate results, like so (e.g. for ncopies=2):
7222 r = phi (init, r)
7223 r = x0 + r;
7224 r = x1 + r;
7225 (i.e. we generate VF/2 results in a single register).
7226 In this case for each copy we get the vector def for the reduction variable
7227 from the vectorized reduction operation generated in the previous iteration.
7229 This only works when we see both the reduction PHI and its only consumer
7230 in vectorizable_reduction and there are no intermediate stmts
7231 participating. */
7232 if (ncopies > 1
7233 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7234 && reduc_chain_length == 1)
7235 single_defuse_cycle = true;
7237 if (single_defuse_cycle || lane_reduc_code_p)
7239 gcc_assert (code != COND_EXPR);
7241 /* 4. Supportable by target? */
7242 bool ok = true;
7244 /* 4.1. check support for the operation in the loop */
7245 optab optab = optab_for_tree_code (code, vectype_in, optab_query_kind);
7246 if (!optab)
7248 if (dump_enabled_p ())
7249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7250 "no optab.\n");
7251 ok = false;
7254 machine_mode vec_mode = TYPE_MODE (vectype_in);
7255 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7257 if (dump_enabled_p ())
7258 dump_printf (MSG_NOTE, "op not supported by target.\n");
7259 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7260 || !vect_can_vectorize_without_simd_p (code))
7261 ok = false;
7262 else
7263 if (dump_enabled_p ())
7264 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7267 if (vect_emulated_vector_p (vectype_in)
7268 && !vect_can_vectorize_without_simd_p (code))
7270 if (dump_enabled_p ())
7271 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7272 return false;
7275 /* lane-reducing operations have to go through vect_transform_reduction.
7276 For the other cases try without the single cycle optimization. */
7277 if (!ok)
7279 if (lane_reduc_code_p)
7280 return false;
7281 else
7282 single_defuse_cycle = false;
7285 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7287 /* If the reduction stmt is one of the patterns that have lane
7288 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7289 if ((ncopies > 1 && ! single_defuse_cycle)
7290 && lane_reduc_code_p)
7292 if (dump_enabled_p ())
7293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7294 "multi def-use cycle not possible for lane-reducing "
7295 "reduction operation\n");
7296 return false;
7299 if (slp_node
7300 && !(!single_defuse_cycle
7301 && code != DOT_PROD_EXPR
7302 && code != WIDEN_SUM_EXPR
7303 && code != SAD_EXPR
7304 && reduction_type != FOLD_LEFT_REDUCTION))
7305 for (i = 0; i < op_type; i++)
7306 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7308 if (dump_enabled_p ())
7309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7310 "incompatible vector types for invariants\n");
7311 return false;
7314 if (slp_node)
7315 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7316 else
7317 vec_num = 1;
7319 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7320 reduction_type, ncopies, cost_vec);
7321 /* Cost the reduction op inside the loop if transformed via
7322 vect_transform_reduction. Otherwise this is costed by the
7323 separate vectorizable_* routines. */
7324 if (single_defuse_cycle
7325 || code == DOT_PROD_EXPR
7326 || code == WIDEN_SUM_EXPR
7327 || code == SAD_EXPR)
7328 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7330 if (dump_enabled_p ()
7331 && reduction_type == FOLD_LEFT_REDUCTION)
7332 dump_printf_loc (MSG_NOTE, vect_location,
7333 "using an in-order (fold-left) reduction.\n");
7334 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7335 /* All reductions except single def-use-cycle optimized, lane-reducing and
7336 fold-left ones go through their own vectorizable_* routines. */
7337 if (!single_defuse_cycle
7338 && code != DOT_PROD_EXPR
7339 && code != WIDEN_SUM_EXPR
7340 && code != SAD_EXPR
7341 && reduction_type != FOLD_LEFT_REDUCTION)
7343 stmt_vec_info tem
7344 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7345 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7347 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7348 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7350 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7351 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7353 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7355 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7356 internal_fn cond_fn = get_conditional_internal_fn (code);
7358 if (reduction_type != FOLD_LEFT_REDUCTION
7359 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7360 && (cond_fn == IFN_LAST
7361 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7362 OPTIMIZE_FOR_SPEED)))
7364 if (dump_enabled_p ())
7365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7366 "can't operate on partial vectors because"
7367 " no conditional operation is available.\n");
7368 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7370 else if (reduction_type == FOLD_LEFT_REDUCTION
7371 && reduc_fn == IFN_LAST
7372 && !expand_vec_cond_expr_p (vectype_in,
7373 truth_type_for (vectype_in),
7374 SSA_NAME))
7376 if (dump_enabled_p ())
7377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7378 "can't operate on partial vectors because"
7379 " no conditional operation is available.\n");
7380 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7382 else
7383 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7384 vectype_in, NULL);
7386 return true;
7389 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7390 value. */
7392 bool
7393 vect_transform_reduction (loop_vec_info loop_vinfo,
7394 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7395 gimple **vec_stmt, slp_tree slp_node)
7397 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7398 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7399 int i;
7400 int ncopies;
7401 int vec_num;
7403 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7404 gcc_assert (reduc_info->is_reduc_info);
7406 if (nested_in_vect_loop_p (loop, stmt_info))
7408 loop = loop->inner;
7409 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7412 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7413 enum tree_code code = gimple_assign_rhs_code (stmt);
7414 int op_type = TREE_CODE_LENGTH (code);
7416 /* Flatten RHS. */
7417 tree ops[3];
7418 switch (get_gimple_rhs_class (code))
7420 case GIMPLE_TERNARY_RHS:
7421 ops[2] = gimple_assign_rhs3 (stmt);
7422 /* Fall thru. */
7423 case GIMPLE_BINARY_RHS:
7424 ops[0] = gimple_assign_rhs1 (stmt);
7425 ops[1] = gimple_assign_rhs2 (stmt);
7426 break;
7427 default:
7428 gcc_unreachable ();
7431 /* All uses but the last are expected to be defined in the loop.
7432 The last use is the reduction variable. In case of nested cycle this
7433 assumption is not true: we use reduc_index to record the index of the
7434 reduction variable. */
7435 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7436 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7437 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7438 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7440 if (slp_node)
7442 ncopies = 1;
7443 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7445 else
7447 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7448 vec_num = 1;
7451 internal_fn cond_fn = get_conditional_internal_fn (code);
7452 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7453 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7455 /* Transform. */
7456 tree new_temp = NULL_TREE;
7457 auto_vec<tree> vec_oprnds0;
7458 auto_vec<tree> vec_oprnds1;
7459 auto_vec<tree> vec_oprnds2;
7460 tree def0;
7462 if (dump_enabled_p ())
7463 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7465 /* FORNOW: Multiple types are not supported for condition. */
7466 if (code == COND_EXPR)
7467 gcc_assert (ncopies == 1);
7469 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7471 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7472 if (reduction_type == FOLD_LEFT_REDUCTION)
7474 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7475 return vectorize_fold_left_reduction
7476 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7477 reduc_fn, ops, vectype_in, reduc_index, masks);
7480 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7481 gcc_assert (single_defuse_cycle
7482 || code == DOT_PROD_EXPR
7483 || code == WIDEN_SUM_EXPR
7484 || code == SAD_EXPR);
7486 /* Create the destination vector */
7487 tree scalar_dest = gimple_assign_lhs (stmt);
7488 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7490 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7491 single_defuse_cycle && reduc_index == 0
7492 ? NULL_TREE : ops[0], &vec_oprnds0,
7493 single_defuse_cycle && reduc_index == 1
7494 ? NULL_TREE : ops[1], &vec_oprnds1,
7495 op_type == ternary_op
7496 && !(single_defuse_cycle && reduc_index == 2)
7497 ? ops[2] : NULL_TREE, &vec_oprnds2);
7498 if (single_defuse_cycle)
7500 gcc_assert (!slp_node);
7501 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7502 ops[reduc_index],
7503 reduc_index == 0 ? &vec_oprnds0
7504 : (reduc_index == 1 ? &vec_oprnds1
7505 : &vec_oprnds2));
7508 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7510 gimple *new_stmt;
7511 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7512 if (masked_loop_p && !mask_by_cond_expr)
7514 /* Make sure that the reduction accumulator is vop[0]. */
7515 if (reduc_index == 1)
7517 gcc_assert (commutative_tree_code (code));
7518 std::swap (vop[0], vop[1]);
7520 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7521 vectype_in, i);
7522 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7523 vop[0], vop[1], vop[0]);
7524 new_temp = make_ssa_name (vec_dest, call);
7525 gimple_call_set_lhs (call, new_temp);
7526 gimple_call_set_nothrow (call, true);
7527 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7528 new_stmt = call;
7530 else
7532 if (op_type == ternary_op)
7533 vop[2] = vec_oprnds2[i];
7535 if (masked_loop_p && mask_by_cond_expr)
7537 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7538 vectype_in, i);
7539 build_vect_cond_expr (code, vop, mask, gsi);
7542 new_stmt = gimple_build_assign (vec_dest, code,
7543 vop[0], vop[1], vop[2]);
7544 new_temp = make_ssa_name (vec_dest, new_stmt);
7545 gimple_assign_set_lhs (new_stmt, new_temp);
7546 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7549 if (slp_node)
7550 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7551 else if (single_defuse_cycle
7552 && i < ncopies - 1)
7554 if (reduc_index == 0)
7555 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7556 else if (reduc_index == 1)
7557 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7558 else if (reduc_index == 2)
7559 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7561 else
7562 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7565 if (!slp_node)
7566 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7568 return true;
7571 /* Transform phase of a cycle PHI. */
7573 bool
7574 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7575 stmt_vec_info stmt_info, gimple **vec_stmt,
7576 slp_tree slp_node, slp_instance slp_node_instance)
7578 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7579 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7580 int i;
7581 int ncopies;
7582 int j;
7583 bool nested_cycle = false;
7584 int vec_num;
7586 if (nested_in_vect_loop_p (loop, stmt_info))
7588 loop = loop->inner;
7589 nested_cycle = true;
7592 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7593 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7594 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7595 gcc_assert (reduc_info->is_reduc_info);
7597 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7598 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7599 /* Leave the scalar phi in place. */
7600 return true;
7602 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7603 /* For a nested cycle we do not fill the above. */
7604 if (!vectype_in)
7605 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7606 gcc_assert (vectype_in);
7608 if (slp_node)
7610 /* The size vect_schedule_slp_instance computes is off for us. */
7611 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7612 * SLP_TREE_LANES (slp_node), vectype_in);
7613 ncopies = 1;
7615 else
7617 vec_num = 1;
7618 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7621 /* Check whether we should use a single PHI node and accumulate
7622 vectors to one before the backedge. */
7623 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7624 ncopies = 1;
7626 /* Create the destination vector */
7627 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7628 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7629 vectype_out);
7631 /* Get the loop-entry arguments. */
7632 tree vec_initial_def = NULL_TREE;
7633 auto_vec<tree> vec_initial_defs;
7634 if (slp_node)
7636 vec_initial_defs.reserve (vec_num);
7637 if (nested_cycle)
7639 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7640 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7641 &vec_initial_defs);
7643 else
7645 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7646 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7647 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7649 unsigned int num_phis = stmts.length ();
7650 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7651 num_phis = 1;
7652 initial_values.reserve (num_phis);
7653 for (unsigned int i = 0; i < num_phis; ++i)
7655 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7656 initial_values.quick_push (vect_phi_initial_value (this_phi));
7658 if (vec_num == 1)
7659 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7660 if (!initial_values.is_empty ())
7662 tree initial_value
7663 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7664 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7665 tree neutral_op
7666 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7667 code, initial_value);
7668 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7669 &vec_initial_defs, vec_num,
7670 stmts.length (), neutral_op);
7674 else
7676 /* Get at the scalar def before the loop, that defines the initial
7677 value of the reduction variable. */
7678 tree initial_def = vect_phi_initial_value (phi);
7679 reduc_info->reduc_initial_values.safe_push (initial_def);
7680 /* Optimize: if, for REDUC_MAX, initial_def is smaller than the base
7681 and we can't use zero for induc_val, use initial_def; similarly
7682 for REDUC_MIN when initial_def is larger than the base. */
7683 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7685 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7686 if (TREE_CODE (initial_def) == INTEGER_CST
7687 && !integer_zerop (induc_val)
7688 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7689 && tree_int_cst_lt (initial_def, induc_val))
7690 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7691 && tree_int_cst_lt (induc_val, initial_def))))
7693 induc_val = initial_def;
7694 /* Communicate to epilogue generation that we used
7695 the initial_def. */
7696 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7698 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7700 else if (nested_cycle)
7702 /* Do not use an adjustment def as that case is not supported
7703 correctly if ncopies is not one. */
7704 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7705 ncopies, initial_def,
7706 &vec_initial_defs);
7708 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7709 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7710 /* Fill the initial vector with the initial scalar value. */
7711 vec_initial_def
7712 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7713 initial_def, initial_def);
7714 else
7716 if (ncopies == 1)
7717 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7718 if (!reduc_info->reduc_initial_values.is_empty ())
7720 initial_def = reduc_info->reduc_initial_values[0];
7721 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7722 tree neutral_op
7723 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7724 code, initial_def);
7725 gcc_assert (neutral_op);
7726 /* Try to simplify the vector initialization by applying an
7727 adjustment after the reduction has been performed. */
7728 if (!reduc_info->reused_accumulator
7729 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7730 && !operand_equal_p (neutral_op, initial_def))
7732 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7733 = initial_def;
7734 initial_def = neutral_op;
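/* Illustrative example of this adjustment (values hypothetical): for
     int sum = 10;  for (...) sum += a[i];
   the vector accumulator starts from the neutral value { 0, 0, 0, 0 } and
   the original start value 10 is added back once in the epilogue via
   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT.  */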
7736 vec_initial_def
7737 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7738 initial_def, neutral_op);
7743 if (vec_initial_def)
7745 vec_initial_defs.create (ncopies);
7746 for (i = 0; i < ncopies; ++i)
7747 vec_initial_defs.quick_push (vec_initial_def);
7750 if (auto *accumulator = reduc_info->reused_accumulator)
7752 tree def = accumulator->reduc_input;
7753 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7755 unsigned int nreduc;
7756 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7757 (TREE_TYPE (def)),
7758 TYPE_VECTOR_SUBPARTS (vectype_out),
7759 &nreduc);
7760 gcc_assert (res);
7761 gimple_seq stmts = NULL;
7762 /* Reduce the single vector to a smaller one. */
7763 if (nreduc != 1)
7765 /* Perform the reduction in the appropriate type. */
7766 tree rvectype = vectype_out;
7767 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7768 TREE_TYPE (TREE_TYPE (def))))
7769 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7770 TYPE_VECTOR_SUBPARTS
7771 (vectype_out));
7772 def = vect_create_partial_epilog (def, rvectype,
7773 STMT_VINFO_REDUC_CODE
7774 (reduc_info),
7775 &stmts);
7777 /* The epilogue loop might use a different vector mode, like
7778 VNx2DI vs. V2DI. */
7779 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7781 tree reduc_type = build_vector_type_for_mode
7782 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7783 def = gimple_convert (&stmts, reduc_type, def);
7785 /* Adjust the input so we pick up the partially reduced value
7786 for the skip edge in vect_create_epilog_for_reduction. */
7787 accumulator->reduc_input = def;
7788 /* And the reduction could be carried out using a different sign. */
7789 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7790 def = gimple_convert (&stmts, vectype_out, def);
7791 if (loop_vinfo->main_loop_edge)
7793 /* While we'd like to insert on the edge, this would split
7794 blocks and disturb bookkeeping, and we will eventually
7795 need this on the skip edge as well. Rely on sinking to
7796 fix up optimal placement and insert in the predecessor. */
7797 gimple_stmt_iterator gsi
7798 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7799 /* Insert before a cond that eventually skips the
7800 epilogue. */
7801 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7802 gsi_prev (&gsi);
7803 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7805 else
7806 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7807 stmts);
7809 if (loop_vinfo->main_loop_edge)
7810 vec_initial_defs[0]
7811 = vect_get_main_loop_result (loop_vinfo, def,
7812 vec_initial_defs[0]);
7813 else
7814 vec_initial_defs.safe_push (def);
7817 /* Generate the reduction PHIs upfront. */
7818 for (i = 0; i < vec_num; i++)
7820 tree vec_init_def = vec_initial_defs[i];
7821 for (j = 0; j < ncopies; j++)
7823 /* Create the reduction-phi that defines the reduction
7824 operand. */
7825 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7827 /* Set the loop-entry arg of the reduction-phi. */
7828 if (j != 0 && nested_cycle)
7829 vec_init_def = vec_initial_defs[j];
7830 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7831 UNKNOWN_LOCATION);
7833 /* The loop-latch arg is set in epilogue processing. */
7835 if (slp_node)
7836 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7837 else
7839 if (j == 0)
7840 *vec_stmt = new_phi;
7841 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7846 return true;
7849 /* Vectorizes LC PHIs. */
7851 bool
7852 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7853 stmt_vec_info stmt_info, gimple **vec_stmt,
7854 slp_tree slp_node)
7856 if (!loop_vinfo
7857 || !is_a <gphi *> (stmt_info->stmt)
7858 || gimple_phi_num_args (stmt_info->stmt) != 1)
7859 return false;
7861 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7862 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7863 return false;
7865 if (!vec_stmt) /* transformation not required. */
7867 /* Deal with copies from externs or constants that disguise as
7868 loop-closed PHI nodes (PR97886). */
7869 if (slp_node
7870 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7871 SLP_TREE_VECTYPE (slp_node)))
7873 if (dump_enabled_p ())
7874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7875 "incompatible vector types for invariants\n");
7876 return false;
7878 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7879 return true;
7882 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7883 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7884 basic_block bb = gimple_bb (stmt_info->stmt);
7885 edge e = single_pred_edge (bb);
7886 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7887 auto_vec<tree> vec_oprnds;
7888 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7889 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7890 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7891 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7893 /* Create the vectorized LC PHI node. */
7894 gphi *new_phi = create_phi_node (vec_dest, bb);
7895 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7896 if (slp_node)
7897 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7898 else
7899 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7901 if (!slp_node)
7902 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7904 return true;
7907 /* Vectorizes PHIs. */
7909 bool
7910 vectorizable_phi (vec_info *,
7911 stmt_vec_info stmt_info, gimple **vec_stmt,
7912 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7914 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7915 return false;
7917 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7918 return false;
7920 tree vectype = SLP_TREE_VECTYPE (slp_node);
7922 if (!vec_stmt) /* transformation not required. */
7924 slp_tree child;
7925 unsigned i;
7926 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7927 if (!child)
7929 if (dump_enabled_p ())
7930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7931 "PHI node with unvectorized backedge def\n");
7932 return false;
7934 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7936 if (dump_enabled_p ())
7937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7938 "incompatible vector types for invariants\n");
7939 return false;
7941 /* For single-argument PHIs assume coalescing which means zero cost
7942 for the scalar and the vector PHIs. This avoids artificially
7943 favoring the vector path (but may pessimize it in some cases). */
7944 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7945 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7946 vector_stmt, stmt_info, vectype, 0, vect_body);
7947 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7948 return true;
7951 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7952 basic_block bb = gimple_bb (stmt_info->stmt);
7953 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7954 auto_vec<gphi *> new_phis;
7955 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7957 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7959 /* Skip not yet vectorized defs. */
7960 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7961 && SLP_TREE_VEC_STMTS (child).is_empty ())
7962 continue;
7964 auto_vec<tree> vec_oprnds;
7965 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7966 if (!new_phis.exists ())
7968 new_phis.create (vec_oprnds.length ());
7969 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7971 /* Create the vectorized LC PHI node. */
7972 new_phis.quick_push (create_phi_node (vec_dest, bb));
7973 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7976 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7977 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7978 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7980 /* We should have at least one already vectorized child. */
7981 gcc_assert (new_phis.exists ());
7983 return true;
7986 /* Return true if VECTYPE represents a vector that requires lowering
7987 by the vector lowering pass. */
7989 bool
7990 vect_emulated_vector_p (tree vectype)
7992 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
7993 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
7994 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
7997 /* Return true if we can emulate CODE on an integer mode representation
7998 of a vector. */
8000 bool
8001 vect_can_vectorize_without_simd_p (tree_code code)
8003 switch (code)
8005 case PLUS_EXPR:
8006 case MINUS_EXPR:
8007 case NEGATE_EXPR:
8008 case BIT_AND_EXPR:
8009 case BIT_IOR_EXPR:
8010 case BIT_XOR_EXPR:
8011 case BIT_NOT_EXPR:
8012 return true;
8014 default:
8015 return false;
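/* As an illustration of such emulation (hypothetical example, not part of
   this file): a bitwise IOR of two "vectors" of four chars can be carried
   out on a plain 32-bit integer,

     uint32_t a, b;       four packed char lanes each
     uint32_t r = a | b;  per-lane result without any SIMD support

   which is why the bitwise codes above are always safe; the additive codes
   listed are likewise handled by the generic vector lowering pass.  */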
8019 /* Function vectorizable_induction
8021 Check if STMT_INFO performs an induction computation that can be vectorized.
8022 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
8023 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
8024 Return true if STMT_INFO is vectorizable in this way. */
8026 bool
8027 vectorizable_induction (loop_vec_info loop_vinfo,
8028 stmt_vec_info stmt_info,
8029 gimple **vec_stmt, slp_tree slp_node,
8030 stmt_vector_for_cost *cost_vec)
8032 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8033 unsigned ncopies;
8034 bool nested_in_vect_loop = false;
8035 class loop *iv_loop;
8036 tree vec_def;
8037 edge pe = loop_preheader_edge (loop);
8038 basic_block new_bb;
8039 tree new_vec, vec_init, vec_step, t;
8040 tree new_name;
8041 gimple *new_stmt;
8042 gphi *induction_phi;
8043 tree induc_def, vec_dest;
8044 tree init_expr, step_expr;
8045 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8046 unsigned i;
8047 tree expr;
8048 gimple_stmt_iterator si;
8050 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8051 if (!phi)
8052 return false;
8054 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8055 return false;
8057 /* Make sure it was recognized as induction computation. */
8058 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8059 return false;
8061 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8062 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8064 if (slp_node)
8065 ncopies = 1;
8066 else
8067 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8068 gcc_assert (ncopies >= 1);
8070 /* FORNOW. These restrictions should be relaxed. */
8071 if (nested_in_vect_loop_p (loop, stmt_info))
8073 imm_use_iterator imm_iter;
8074 use_operand_p use_p;
8075 gimple *exit_phi;
8076 edge latch_e;
8077 tree loop_arg;
8079 if (ncopies > 1)
8081 if (dump_enabled_p ())
8082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8083 "multiple types in nested loop.\n");
8084 return false;
8087 exit_phi = NULL;
8088 latch_e = loop_latch_edge (loop->inner);
8089 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8090 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8092 gimple *use_stmt = USE_STMT (use_p);
8093 if (is_gimple_debug (use_stmt))
8094 continue;
8096 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8098 exit_phi = use_stmt;
8099 break;
8102 if (exit_phi)
8104 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8105 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8106 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8108 if (dump_enabled_p ())
8109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8110 "inner-loop induction only used outside "
8111 "of the outer vectorized loop.\n");
8112 return false;
8116 nested_in_vect_loop = true;
8117 iv_loop = loop->inner;
8119 else
8120 iv_loop = loop;
8121 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8123 if (slp_node && !nunits.is_constant ())
8125 /* The current SLP code creates the step value element-by-element. */
8126 if (dump_enabled_p ())
8127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8128 "SLP induction not supported for variable-length"
8129 " vectors.\n");
8130 return false;
8133 if (!vec_stmt) /* transformation not required. */
8135 unsigned inside_cost = 0, prologue_cost = 0;
8136 if (slp_node)
8138 /* We eventually need to set a vector type on invariant
8139 arguments. */
8140 unsigned j;
8141 slp_tree child;
8142 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8143 if (!vect_maybe_update_slp_op_vectype
8144 (child, SLP_TREE_VECTYPE (slp_node)))
8146 if (dump_enabled_p ())
8147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8148 "incompatible vector types for "
8149 "invariants\n");
8150 return false;
8152 /* loop cost for vec_loop. */
8153 inside_cost
8154 = record_stmt_cost (cost_vec,
8155 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8156 vector_stmt, stmt_info, 0, vect_body);
8157 /* prologue cost for vec_init (if not nested) and step. */
8158 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8159 scalar_to_vec,
8160 stmt_info, 0, vect_prologue);
8162 else /* if (!slp_node) */
8164 /* loop cost for vec_loop. */
8165 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8166 stmt_info, 0, vect_body);
8167 /* prologue cost for vec_init and vec_step. */
8168 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8169 stmt_info, 0, vect_prologue);
8171 if (dump_enabled_p ())
8172 dump_printf_loc (MSG_NOTE, vect_location,
8173 "vect_model_induction_cost: inside_cost = %d, "
8174 "prologue_cost = %d .\n", inside_cost,
8175 prologue_cost);
8177 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8178 DUMP_VECT_SCOPE ("vectorizable_induction");
8179 return true;
8182 /* Transform. */
8184 /* Compute a vector variable, initialized with the first VF values of
8185 the induction variable. E.g., for an iv with IV_PHI='X' and
8186 evolution S, for a vector of 4 units, we want to compute:
8187 [X, X + S, X + 2*S, X + 3*S]. */
8189 if (dump_enabled_p ())
8190 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8192 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8193 gcc_assert (step_expr != NULL_TREE);
8194 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8196 pe = loop_preheader_edge (iv_loop);
8197 /* Find the first insertion point in the BB. */
8198 basic_block bb = gimple_bb (phi);
8199 si = gsi_after_labels (bb);
8201 /* For SLP induction we have to generate several IVs as for example
8202 with group size 3 we need
8203 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8204 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8205 if (slp_node)
8207 /* Enforced above. */
8208 unsigned int const_nunits = nunits.to_constant ();
8210 /* The initial values are vectorized, but any lanes > group_size
8211 need adjustment. */
8212 slp_tree init_node
8213 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8215 /* Gather steps. Since we do not vectorize inductions as
8216 cycles we have to reconstruct the step from SCEV data. */
8217 unsigned group_size = SLP_TREE_LANES (slp_node);
8218 tree *steps = XALLOCAVEC (tree, group_size);
8219 tree *inits = XALLOCAVEC (tree, group_size);
8220 stmt_vec_info phi_info;
8221 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8223 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8224 if (!init_node)
8225 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8226 pe->dest_idx);
8229 /* Now generate the IVs. */
8230 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8231 gcc_assert ((const_nunits * nvects) % group_size == 0);
8232 unsigned nivs;
8233 if (nested_in_vect_loop)
8234 nivs = nvects;
8235 else
8237 /* Compute the number of distinct IVs we need. First reduce
8238 group_size if it is a multiple of const_nunits so we get
8239 one IV for a group_size of 4 but const_nunits 2. */
8240 unsigned group_sizep = group_size;
8241 if (group_sizep % const_nunits == 0)
8242 group_sizep = group_sizep / const_nunits;
8243 nivs = least_common_multiple (group_sizep,
8244 const_nunits) / const_nunits;
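/* Worked example (numbers illustrative): for group_size 3 and const_nunits 4,
   group_sizep stays 3 and nivs = lcm (3, 4) / 4 = 3, matching the three
   interleaved IVs in the comment further above; for group_size 4 and
   const_nunits 2, group_sizep becomes 2 and nivs = lcm (2, 2) / 2 = 1.  */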
8246 tree stept = TREE_TYPE (step_vectype);
8247 tree lupdate_mul = NULL_TREE;
8248 if (!nested_in_vect_loop)
8250 /* The number of iterations covered in one vector iteration. */
8251 unsigned lup_mul = (nvects * const_nunits) / group_size;
8252 lupdate_mul
8253 = build_vector_from_val (step_vectype,
8254 SCALAR_FLOAT_TYPE_P (stept)
8255 ? build_real_from_wide (stept, lup_mul,
8256 UNSIGNED)
8257 : build_int_cstu (stept, lup_mul));
8259 tree peel_mul = NULL_TREE;
8260 gimple_seq init_stmts = NULL;
8261 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8263 if (SCALAR_FLOAT_TYPE_P (stept))
8264 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8265 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8266 else
8267 peel_mul = gimple_convert (&init_stmts, stept,
8268 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8269 peel_mul = gimple_build_vector_from_val (&init_stmts,
8270 step_vectype, peel_mul);
8272 unsigned ivn;
8273 auto_vec<tree> vec_steps;
8274 for (ivn = 0; ivn < nivs; ++ivn)
8276 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8277 tree_vector_builder init_elts (vectype, const_nunits, 1);
8278 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8279 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8281 /* The scalar steps of the IVs. */
8282 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8283 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8284 step_elts.quick_push (elt);
8285 if (!init_node)
8287 /* The scalar inits of the IVs if not vectorized. */
8288 elt = inits[(ivn*const_nunits + eltn) % group_size];
8289 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8290 TREE_TYPE (elt)))
8291 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8292 TREE_TYPE (vectype), elt);
8293 init_elts.quick_push (elt);
8295 /* The number of steps to add to the initial values. */
8296 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8297 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8298 ? build_real_from_wide (stept,
8299 mul_elt, UNSIGNED)
8300 : build_int_cstu (stept, mul_elt));
8302 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8303 vec_steps.safe_push (vec_step);
8304 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8305 if (peel_mul)
8306 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8307 step_mul, peel_mul);
8308 if (!init_node)
8309 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8311 /* Create the induction-phi that defines the induction-operand. */
8312 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8313 "vec_iv_");
8314 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8315 induc_def = PHI_RESULT (induction_phi);
8317 /* Create the iv update inside the loop */
8318 tree up = vec_step;
8319 if (lupdate_mul)
8320 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8321 vec_step, lupdate_mul);
8322 gimple_seq stmts = NULL;
8323 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8324 vec_def = gimple_build (&stmts,
8325 PLUS_EXPR, step_vectype, vec_def, up);
8326 vec_def = gimple_convert (&stmts, vectype, vec_def);
8327 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8328 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8329 UNKNOWN_LOCATION);
8331 if (init_node)
8332 vec_init = vect_get_slp_vect_def (init_node, ivn);
8333 if (!nested_in_vect_loop
8334 && !integer_zerop (step_mul))
8336 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8337 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8338 vec_step, step_mul);
8339 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8340 vec_def, up);
8341 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8344 /* Set the arguments of the phi node: */
8345 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8347 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8349 if (!nested_in_vect_loop)
8351 /* Fill up to the number of vectors we need for the whole group. */
8352 nivs = least_common_multiple (group_size,
8353 const_nunits) / const_nunits;
8354 vec_steps.reserve (nivs-ivn);
8355 for (; ivn < nivs; ++ivn)
8357 SLP_TREE_VEC_STMTS (slp_node)
8358 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8359 vec_steps.quick_push (vec_steps[0]);
8363 /* Re-use IVs when we can. We are generating further vector
8364 stmts by adding VF' * stride to the IVs generated above. */
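/* For instance (illustrative numbers): with nivs == 2 and nvects == 4,
   vector stmts 2 and 3 are generated below as IV0 + VF' * step0 and
   IV1 + VF' * step1, where VF' (vfp) is the number of scalar iterations
   covered by the first nivs vectors.  */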
8365 if (ivn < nvects)
8367 unsigned vfp
8368 = least_common_multiple (group_size, const_nunits) / group_size;
8369 tree lupdate_mul
8370 = build_vector_from_val (step_vectype,
8371 SCALAR_FLOAT_TYPE_P (stept)
8372 ? build_real_from_wide (stept,
8373 vfp, UNSIGNED)
8374 : build_int_cstu (stept, vfp));
8375 for (; ivn < nvects; ++ivn)
8377 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8378 tree def = gimple_get_lhs (iv);
8379 if (ivn < 2*nivs)
8380 vec_steps[ivn - nivs]
8381 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8382 vec_steps[ivn - nivs], lupdate_mul);
8383 gimple_seq stmts = NULL;
8384 def = gimple_convert (&stmts, step_vectype, def);
8385 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8386 def, vec_steps[ivn % nivs]);
8387 def = gimple_convert (&stmts, vectype, def);
8388 if (gimple_code (iv) == GIMPLE_PHI)
8389 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8390 else
8392 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8393 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8395 SLP_TREE_VEC_STMTS (slp_node)
8396 .quick_push (SSA_NAME_DEF_STMT (def));
8400 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8401 gcc_assert (!new_bb);
8403 return true;
8406 init_expr = vect_phi_initial_value (phi);
8408 gimple_seq stmts = NULL;
8409 if (!nested_in_vect_loop)
8411 /* Convert the initial value to the IV update type. */
8412 tree new_type = TREE_TYPE (step_expr);
8413 init_expr = gimple_convert (&stmts, new_type, init_expr);
8415 /* If we are using the loop mask to "peel" for alignment then we need
8416 to adjust the start value here. */
8417 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8418 if (skip_niters != NULL_TREE)
8420 if (FLOAT_TYPE_P (vectype))
8421 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8422 skip_niters);
8423 else
8424 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8425 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8426 skip_niters, step_expr);
8427 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8428 init_expr, skip_step);
8432 if (stmts)
8434 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8435 gcc_assert (!new_bb);
8438 /* Create the vector that holds the initial_value of the induction. */
8439 if (nested_in_vect_loop)
8441 /* iv_loop is nested in the loop to be vectorized. init_expr has already
8442 been created during vectorization of previous stmts. We obtain it
8443 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8444 auto_vec<tree> vec_inits;
8445 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8446 init_expr, &vec_inits);
8447 vec_init = vec_inits[0];
8448 /* If the initial value is not of proper type, convert it. */
8449 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8451 new_stmt
8452 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8453 vect_simple_var,
8454 "vec_iv_"),
8455 VIEW_CONVERT_EXPR,
8456 build1 (VIEW_CONVERT_EXPR, vectype,
8457 vec_init));
8458 vec_init = gimple_assign_lhs (new_stmt);
8459 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8460 new_stmt);
8461 gcc_assert (!new_bb);
8464 else
8466 /* iv_loop is the loop to be vectorized. Create:
8467 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8468 stmts = NULL;
8469 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8471 unsigned HOST_WIDE_INT const_nunits;
8472 if (nunits.is_constant (&const_nunits))
8474 tree_vector_builder elts (step_vectype, const_nunits, 1);
8475 elts.quick_push (new_name);
8476 for (i = 1; i < const_nunits; i++)
8478 /* Create: new_name_i = new_name + step_expr */
8479 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8480 new_name, step_expr);
8481 elts.quick_push (new_name);
8483 /* Create a vector from [new_name_0, new_name_1, ...,
8484 new_name_nunits-1] */
8485 vec_init = gimple_build_vector (&stmts, &elts);
8487 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8488 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8489 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8490 new_name, step_expr);
8491 else
8493 /* Build:
8494 [base, base, base, ...]
8495 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8496 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8497 gcc_assert (flag_associative_math);
8498 tree index = build_index_vector (step_vectype, 0, 1);
8499 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8500 new_name);
8501 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8502 step_expr);
8503 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8504 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8505 vec_init, step_vec);
8506 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8507 vec_init, base_vec);
8509 vec_init = gimple_convert (&stmts, vectype, vec_init);
8511 if (stmts)
8513 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8514 gcc_assert (!new_bb);
8519 /* Create the vector that holds the step of the induction. */
8520 if (nested_in_vect_loop)
8521 /* iv_loop is nested in the loop to be vectorized. Generate:
8522 vec_step = [S, S, S, S] */
8523 new_name = step_expr;
8524 else
8526 /* iv_loop is the loop to be vectorized. Generate:
8527 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8528 gimple_seq seq = NULL;
8529 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8531 expr = build_int_cst (integer_type_node, vf);
8532 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8534 else
8535 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8536 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8537 expr, step_expr);
8538 if (seq)
8540 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8541 gcc_assert (!new_bb);
8545 t = unshare_expr (new_name);
8546 gcc_assert (CONSTANT_CLASS_P (new_name)
8547 || TREE_CODE (new_name) == SSA_NAME);
8548 new_vec = build_vector_from_val (step_vectype, t);
8549 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8550 new_vec, step_vectype, NULL);
8553 /* Create the following def-use cycle:
8554 loop prolog:
8555 vec_init = ...
8556 vec_step = ...
8557 loop:
8558 vec_iv = PHI <vec_init, vec_loop>
8560 STMT
8562 vec_loop = vec_iv + vec_step; */
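/* For a single scalar IV "i = init; ... i += S" and VF == nunits == 4 this
   is roughly (an illustrative sketch, not literal output):
     loop prolog:
       vec_init = { init, init+S, init+2*S, init+3*S }
       vec_step = { 4*S, 4*S, 4*S, 4*S }
     loop:
       vec_iv = PHI <vec_init (preheader), vec_loop (latch)>
       ...
       vec_loop = vec_iv + vec_step;  */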
8564 /* Create the induction-phi that defines the induction-operand. */
8565 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8566 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8567 induc_def = PHI_RESULT (induction_phi);
8569 /* Create the iv update inside the loop */
8570 stmts = NULL;
8571 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8572 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8573 vec_def = gimple_convert (&stmts, vectype, vec_def);
8574 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8575 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8577 /* Set the arguments of the phi node: */
8578 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8579 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8580 UNKNOWN_LOCATION);
8582 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8583 *vec_stmt = induction_phi;
8585 /* In case the vectorization factor (VF) is bigger than the number
8586 of elements that we can fit in a vectype (nunits), we have to generate
8587 more than one vector stmt - i.e. we need to "unroll" the
8588 vector stmt by a factor of VF/nunits. For more details see the
8589 documentation in vectorizable_operation. */
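/* For example (illustrative numbers): with VF == 8 and nunits == 4 we get
   ncopies == 2, and the second copy is generated below as
     vec_1 = vec_iv + { 4*S, 4*S, 4*S, 4*S }
   i.e. the first copy advanced by nunits scalar steps.  */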
8591 if (ncopies > 1)
8593 gimple_seq seq = NULL;
8594 /* FORNOW. This restriction should be relaxed. */
8595 gcc_assert (!nested_in_vect_loop);
8597 /* Create the vector that holds the step of the induction. */
8598 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8600 expr = build_int_cst (integer_type_node, nunits);
8601 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8603 else
8604 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8605 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8606 expr, step_expr);
8607 if (seq)
8609 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8610 gcc_assert (!new_bb);
8613 t = unshare_expr (new_name);
8614 gcc_assert (CONSTANT_CLASS_P (new_name)
8615 || TREE_CODE (new_name) == SSA_NAME);
8616 new_vec = build_vector_from_val (step_vectype, t);
8617 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8618 new_vec, step_vectype, NULL);
8620 vec_def = induc_def;
8621 for (i = 1; i < ncopies; i++)
8623 /* vec_i = vec_prev + vec_step */
8624 gimple_seq stmts = NULL;
8625 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8626 vec_def = gimple_build (&stmts,
8627 PLUS_EXPR, step_vectype, vec_def, vec_step);
8628 vec_def = gimple_convert (&stmts, vectype, vec_def);
8630 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8631 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8632 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8636 if (dump_enabled_p ())
8637 dump_printf_loc (MSG_NOTE, vect_location,
8638 "transform induction: created def-use cycle: %G%G",
8639 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8641 return true;
8644 /* Function vectorizable_live_operation.
8646 STMT_INFO computes a value that is used outside the loop. Check if
8647 it can be supported. */
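/* A typical case (illustrative source, not taken from this file):

     int last = 0;
     for (int i = 0; i < n; ++i)
       {
         last = a[i] + 1;
         b[i] = last;
       }
     return last;

   The final value of "last" is live after the loop and has to be extracted
   from the last lane of the last vector computed for it.  */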
8649 bool
8650 vectorizable_live_operation (vec_info *vinfo,
8651 stmt_vec_info stmt_info,
8652 gimple_stmt_iterator *gsi,
8653 slp_tree slp_node, slp_instance slp_node_instance,
8654 int slp_index, bool vec_stmt_p,
8655 stmt_vector_for_cost *cost_vec)
8657 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8658 imm_use_iterator imm_iter;
8659 tree lhs, lhs_type, bitsize;
8660 tree vectype = (slp_node
8661 ? SLP_TREE_VECTYPE (slp_node)
8662 : STMT_VINFO_VECTYPE (stmt_info));
8663 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8664 int ncopies;
8665 gimple *use_stmt;
8666 auto_vec<tree> vec_oprnds;
8667 int vec_entry = 0;
8668 poly_uint64 vec_index = 0;
8670 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8672 /* If a stmt of a reduction is live, vectorize it via
8673 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8674 validity so just trigger the transform here. */
8675 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8677 if (!vec_stmt_p)
8678 return true;
8679 if (slp_node)
8681 /* For reduction chains the meta-info is attached to
8682 the group leader. */
8683 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8684 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8685 /* For SLP reductions we vectorize the epilogue for
8686 all involved stmts together. */
8687 else if (slp_index != 0)
8688 return true;
8689 else
8690 /* For SLP reductions the meta-info is attached to
8691 the representative. */
8692 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8694 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8695 gcc_assert (reduc_info->is_reduc_info);
8696 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8697 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8698 return true;
8699 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8700 slp_node_instance);
8701 return true;
8704 /* If STMT is not relevant and it is a simple assignment and its inputs are
8705 invariant then it can remain in place, unvectorized. The original last
8706 scalar value that it computes will be used. */
8707 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8709 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8710 if (dump_enabled_p ())
8711 dump_printf_loc (MSG_NOTE, vect_location,
8712 "statement is simple and its uses are invariant. Leaving in "
8713 "place.\n");
8714 return true;
8717 if (slp_node)
8718 ncopies = 1;
8719 else
8720 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8722 if (slp_node)
8724 gcc_assert (slp_index >= 0);
8726 /* Get the last occurrence of the scalar index from the concatenation of
8727 all the slp vectors. Calculate which slp vector it is and the index
8728 within. */
8729 int num_scalar = SLP_TREE_LANES (slp_node);
8730 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8731 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
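/* Worked example (illustrative numbers): with num_scalar == 3, num_vec == 2,
   nunits == 4 and slp_index == 1, pos == 2 * 4 - 3 + 1 == 6, which gives
   vec_entry == 1 and vec_index == 2, i.e. lane 2 of the second vector.  */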
8733 /* Calculate which vector contains the result, and which lane of
8734 that vector we need. */
8735 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8737 if (dump_enabled_p ())
8738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8739 "Cannot determine which vector holds the"
8740 " final result.\n");
8741 return false;
8745 if (!vec_stmt_p)
8747 /* No transformation required. */
8748 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8750 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8751 OPTIMIZE_FOR_SPEED))
8753 if (dump_enabled_p ())
8754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8755 "can't operate on partial vectors "
8756 "because the target doesn't support extract "
8757 "last reduction.\n");
8758 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8760 else if (slp_node)
8762 if (dump_enabled_p ())
8763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8764 "can't operate on partial vectors "
8765 "because an SLP statement is live after "
8766 "the loop.\n");
8767 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8769 else if (ncopies > 1)
8771 if (dump_enabled_p ())
8772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8773 "can't operate on partial vectors "
8774 "because ncopies is greater than 1.\n");
8775 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8777 else
8779 gcc_assert (ncopies == 1 && !slp_node);
8780 vect_record_loop_mask (loop_vinfo,
8781 &LOOP_VINFO_MASKS (loop_vinfo),
8782 1, vectype, NULL);
8785 /* ??? Enable for loop costing as well. */
8786 if (!loop_vinfo)
8787 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8788 0, vect_epilogue);
8789 return true;
8792 /* Use the lhs of the original scalar statement. */
8793 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8794 if (dump_enabled_p ())
8795 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8796 "stmt %G", stmt);
8798 lhs = gimple_get_lhs (stmt);
8799 lhs_type = TREE_TYPE (lhs);
8801 bitsize = vector_element_bits_tree (vectype);
8803 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8804 tree vec_lhs, bitstart;
8805 gimple *vec_stmt;
8806 if (slp_node)
8808 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8810 /* Get the correct slp vectorized stmt. */
8811 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8812 vec_lhs = gimple_get_lhs (vec_stmt);
8814 /* Get entry to use. */
8815 bitstart = bitsize_int (vec_index);
8816 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8818 else
8820 /* For multiple copies, get the last copy. */
8821 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8822 vec_lhs = gimple_get_lhs (vec_stmt);
8824 /* Get the last lane in the vector. */
8825 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8828 if (loop_vinfo)
8830 /* To ensure the VEC_LHS for the lane-extraction stmts satisfies the
8831 loop-closed-PHI requirement, insert one PHI node for it. It looks like:
8832 loop;
8834 # lhs' = PHI <lhs>
8836 loop;
8838 # vec_lhs' = PHI <vec_lhs>
8839 new_tree = lane_extract <vec_lhs', ...>;
8840 lhs' = new_tree; */
8842 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8843 basic_block exit_bb = single_exit (loop)->dest;
8844 gcc_assert (single_pred_p (exit_bb));
8846 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8847 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8848 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8850 gimple_seq stmts = NULL;
8851 tree new_tree;
8852 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8854 /* Emit:
8856 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8858 where VEC_LHS is the vectorized live-out result and MASK is
8859 the loop mask for the final iteration. */
8860 gcc_assert (ncopies == 1 && !slp_node);
8861 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8862 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8863 1, vectype, 0);
8864 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8865 mask, vec_lhs_phi);
8867 /* Convert the extracted vector element to the scalar type. */
8868 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8870 else
8872 tree bftype = TREE_TYPE (vectype);
8873 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8874 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8875 new_tree = build3 (BIT_FIELD_REF, bftype,
8876 vec_lhs_phi, bitsize, bitstart);
8877 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8878 &stmts, true, NULL_TREE);
8881 if (stmts)
8883 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8884 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8886 /* Remove existing phi from lhs and create one copy from new_tree. */
8887 tree lhs_phi = NULL_TREE;
8888 gimple_stmt_iterator gsi;
8889 for (gsi = gsi_start_phis (exit_bb);
8890 !gsi_end_p (gsi); gsi_next (&gsi))
8892 gimple *phi = gsi_stmt (gsi);
8893 if ((gimple_phi_arg_def (phi, 0) == lhs))
8895 remove_phi_node (&gsi, false);
8896 lhs_phi = gimple_phi_result (phi);
8897 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8898 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8899 break;
8904 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8905 single-arg PHI, just replace all uses of the PHI result. This is necessary
8906 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8907 use_operand_p use_p;
8908 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8909 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8910 && !is_gimple_debug (use_stmt))
8912 if (gimple_code (use_stmt) == GIMPLE_PHI
8913 && gimple_phi_num_args (use_stmt) == 1)
8915 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8917 else
8919 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8920 SET_USE (use_p, new_tree);
8922 update_stmt (use_stmt);
8925 else
8927 /* For basic-block vectorization simply insert the lane-extraction. */
8928 tree bftype = TREE_TYPE (vectype);
8929 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8930 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8931 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8932 vec_lhs, bitsize, bitstart);
8933 gimple_seq stmts = NULL;
8934 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8935 &stmts, true, NULL_TREE);
8936 if (TREE_CODE (new_tree) == SSA_NAME
8937 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8938 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8939 if (is_a <gphi *> (vec_stmt))
8941 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8942 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8944 else
8946 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8947 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8950 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8951 single-arg PHI, just replace all uses of the PHI result. This is necessary
8952 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8953 use_operand_p use_p;
8954 stmt_vec_info use_stmt_info;
8955 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8956 if (!is_gimple_debug (use_stmt)
8957 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8958 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8960 /* ??? This can happen when the live lane ends up being
8961 used in a vector construction code-generated by an
8962 external SLP node (and code-generation for that already
8963 happened). See gcc.dg/vect/bb-slp-47.c.
8964 Doing this is what would happen if that vector CTOR
8965 were not code-generated yet so it is not too bad.
8966 ??? In fact we'd likely want to avoid this situation
8967 in the first place. */
8968 if (TREE_CODE (new_tree) == SSA_NAME
8969 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8970 && gimple_code (use_stmt) != GIMPLE_PHI
8971 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8972 use_stmt))
8974 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8975 gcc_assert (code == CONSTRUCTOR
8976 || code == VIEW_CONVERT_EXPR
8977 || CONVERT_EXPR_CODE_P (code));
8978 if (dump_enabled_p ())
8979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8980 "Using original scalar computation for "
8981 "live lane because use precedes vector "
8982 "def\n");
8983 continue;
8985 /* ??? It can also happen that we end up pulling a def into
8986 a loop where replacing out-of-loop uses would require
8987 a new LC SSA PHI node. Retain the original scalar in
8988 those cases as well. PR98064. */
8989 if (TREE_CODE (new_tree) == SSA_NAME
8990 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8991 && (gimple_bb (use_stmt)->loop_father
8992 != gimple_bb (vec_stmt)->loop_father)
8993 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8994 gimple_bb (use_stmt)->loop_father))
8996 if (dump_enabled_p ())
8997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8998 "Using original scalar computation for "
8999 "live lane because there is an out-of-loop "
9000 "definition for it\n");
9001 continue;
9003 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9004 SET_USE (use_p, new_tree);
9005 update_stmt (use_stmt);
9009 return true;
9012 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
9014 static void
9015 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
9017 ssa_op_iter op_iter;
9018 imm_use_iterator imm_iter;
9019 def_operand_p def_p;
9020 gimple *ustmt;
9022 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
9024 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
9026 basic_block bb;
9028 if (!is_gimple_debug (ustmt))
9029 continue;
9031 bb = gimple_bb (ustmt);
9033 if (!flow_bb_inside_loop_p (loop, bb))
9035 if (gimple_debug_bind_p (ustmt))
9037 if (dump_enabled_p ())
9038 dump_printf_loc (MSG_NOTE, vect_location,
9039 "killing debug use\n");
9041 gimple_debug_bind_reset_value (ustmt);
9042 update_stmt (ustmt);
9044 else
9045 gcc_unreachable ();
9051 /* Given the loop represented by LOOP_VINFO, return true if computation of
9052 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9053 otherwise. */
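/* For instance, if NITERS has a 32-bit unsigned type and the latch can run
   0xffffffff times (NITERSM1 == 0xffffffff), then NITERSM1 + 1 wraps to zero
   and we must return false; if the known upper bound on the latch count is
   smaller than the type's maximum value, no wrap can occur.  */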
9055 static bool
9056 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9058 /* Constant case. */
9059 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9061 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9062 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9064 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9065 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9066 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9067 return true;
9070 widest_int max;
9071 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9072 /* Check the upper bound of loop niters. */
9073 if (get_max_loop_iterations (loop, &max))
9075 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9076 signop sgn = TYPE_SIGN (type);
9077 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9078 if (max < type_max)
9079 return true;
9081 return false;
9084 /* Return a mask type with half the number of elements as OLD_TYPE,
9085 given that it should have mode NEW_MODE. */
9087 tree
9088 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9090 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9091 return build_truth_vector_type_for_mode (nunits, new_mode);
9094 /* Return a mask type with twice as many elements as OLD_TYPE,
9095 given that it should have mode NEW_MODE. */
9097 tree
9098 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9100 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9101 return build_truth_vector_type_for_mode (nunits, new_mode);
9104 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9105 contain a sequence of NVECTORS masks that each control a vector of type
9106 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
9107 these vector masks with the vector version of SCALAR_MASK. */
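/* A typical caller records the requirement during analysis and fetches the
   actual mask during the transform, along the lines of (a sketch only, with
   ncopies, scalar_mask and i standing for the caller's own values):

     vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
                            ncopies, vectype, scalar_mask);
     ...
     tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
                                     ncopies, vectype, i);  */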
9109 void
9110 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9111 unsigned int nvectors, tree vectype, tree scalar_mask)
9113 gcc_assert (nvectors != 0);
9114 if (masks->length () < nvectors)
9115 masks->safe_grow_cleared (nvectors, true);
9116 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9117 /* The number of scalars per iteration and the number of vectors are
9118 both compile-time constants. */
9119 unsigned int nscalars_per_iter
9120 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9121 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9123 if (scalar_mask)
9125 scalar_cond_masked_key cond (scalar_mask, nvectors);
9126 loop_vinfo->scalar_cond_masked_set.add (cond);
9129 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9131 rgm->max_nscalars_per_iter = nscalars_per_iter;
9132 rgm->type = truth_type_for (vectype);
9133 rgm->factor = 1;
9137 /* Given a complete set of masks MASKS, extract mask number INDEX
9138 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9139 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
9141 See the comment above vec_loop_masks for more details about the mask
9142 arrangement. */
9144 tree
9145 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9146 unsigned int nvectors, tree vectype, unsigned int index)
9148 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9149 tree mask_type = rgm->type;
9151 /* Populate the rgroup's mask array, if this is the first time we've
9152 used it. */
9153 if (rgm->controls.is_empty ())
9155 rgm->controls.safe_grow_cleared (nvectors, true);
9156 for (unsigned int i = 0; i < nvectors; ++i)
9158 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9159 /* Provide a dummy definition until the real one is available. */
9160 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9161 rgm->controls[i] = mask;
9165 tree mask = rgm->controls[index];
9166 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9167 TYPE_VECTOR_SUBPARTS (vectype)))
9169 /* A loop mask for data type X can be reused for data type Y
9170 if X has N times more elements than Y and if Y's elements
9171 are N times bigger than X's. In this case each sequence
9172 of N elements in the loop mask will be all-zero or all-one.
9173 We can then view-convert the mask so that each sequence of
9174 N elements is replaced by a single element. */
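/* Concretely (illustrative types): a mask recorded for 16 x QI data can be
   reused for 4 x SI data; each aligned group of four mask elements is known
   to be all-zero or all-one, so the VIEW_CONVERT_EXPR below collapses every
   such group into a single element of the SI mask type.  */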
9175 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9176 TYPE_VECTOR_SUBPARTS (vectype)));
9177 gimple_seq seq = NULL;
9178 mask_type = truth_type_for (vectype);
9179 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9180 if (seq)
9181 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9183 return mask;
9186 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9187 lengths for controlling an operation on VECTYPE. The operation splits
9188 each element of VECTYPE into FACTOR separate subelements, measuring the
9189 length as a number of these subelements. */
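/* For example (an illustrative case): a 4 x SI access that has to fall back
   to operating on 16 x QI would be recorded with FACTOR == 4, so its length
   is measured in bytes (QI subelements) rather than in SI elements.  */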
9191 void
9192 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9193 unsigned int nvectors, tree vectype, unsigned int factor)
9195 gcc_assert (nvectors != 0);
9196 if (lens->length () < nvectors)
9197 lens->safe_grow_cleared (nvectors, true);
9198 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9200 /* The number of scalars per iteration, the number of bytes each scalar
9201 occupies and the number of vectors are all compile-time constants. */
9202 unsigned int nscalars_per_iter
9203 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9204 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9206 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9208 /* For now, we only support cases in which all loads and stores fall back
9209 to VnQI or none do. */
9210 gcc_assert (!rgl->max_nscalars_per_iter
9211 || (rgl->factor == 1 && factor == 1)
9212 || (rgl->max_nscalars_per_iter * rgl->factor
9213 == nscalars_per_iter * factor));
9214 rgl->max_nscalars_per_iter = nscalars_per_iter;
9215 rgl->type = vectype;
9216 rgl->factor = factor;
9220 /* Given a complete set of lengths LENS, extract length number INDEX for an
9221 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9223 tree
9224 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9225 unsigned int nvectors, unsigned int index)
9227 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9229 /* Populate the rgroup's len array, if this is the first time we've
9230 used it. */
9231 if (rgl->controls.is_empty ())
9233 rgl->controls.safe_grow_cleared (nvectors, true);
9234 for (unsigned int i = 0; i < nvectors; ++i)
9236 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9237 gcc_assert (len_type != NULL_TREE);
9238 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9240 /* Provide a dummy definition until the real one is available. */
9241 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9242 rgl->controls[i] = len;
9246 return rgl->controls[index];
9249 /* Scale profiling counters by estimation for LOOP which is vectorized
9250 by factor VF. */
9252 static void
9253 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9255 edge preheader = loop_preheader_edge (loop);
9256 /* Reduce loop iterations by the vectorization factor. */
9257 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9258 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9260 if (freq_h.nonzero_p ())
9262 profile_probability p;
9264 /* Avoid dropping loop body profile counter to 0 because of zero count
9265 in loop's preheader. */
9266 if (!(freq_e == profile_count::zero ()))
9267 freq_e = freq_e.force_nonzero ();
9268 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9269 scale_loop_frequencies (loop, p);
9272 edge exit_e = single_exit (loop);
9273 exit_e->probability = profile_probability::always ()
9274 .apply_scale (1, new_est_niter + 1);
9276 edge exit_l = single_pred_edge (loop->latch);
9277 profile_probability prob = exit_l->probability;
9278 exit_l->probability = exit_e->probability.invert ();
9279 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9280 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9283 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9284 latch edge values originally defined by it. */
9286 static void
9287 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9288 stmt_vec_info def_stmt_info)
9290 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9291 if (!def || TREE_CODE (def) != SSA_NAME)
9292 return;
9293 stmt_vec_info phi_info;
9294 imm_use_iterator iter;
9295 use_operand_p use_p;
9296 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9297 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9298 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9299 && (phi_info = loop_vinfo->lookup_stmt (phi))
9300 && STMT_VINFO_RELEVANT_P (phi_info)
9301 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9302 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9303 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9305 loop_p loop = gimple_bb (phi)->loop_father;
9306 edge e = loop_latch_edge (loop);
9307 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9309 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9310 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9311 gcc_assert (phi_defs.length () == latch_defs.length ());
9312 for (unsigned i = 0; i < phi_defs.length (); ++i)
9313 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9314 gimple_get_lhs (latch_defs[i]), e,
9315 gimple_phi_arg_location (phi, e->dest_idx));
9320 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9321 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9322 stmt_vec_info. */
9324 static bool
9325 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9326 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9328 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9329 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9331 if (dump_enabled_p ())
9332 dump_printf_loc (MSG_NOTE, vect_location,
9333 "------>vectorizing statement: %G", stmt_info->stmt);
9335 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9336 vect_loop_kill_debug_uses (loop, stmt_info);
9338 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9339 && !STMT_VINFO_LIVE_P (stmt_info))
9340 return false;
9342 if (STMT_VINFO_VECTYPE (stmt_info))
9344 poly_uint64 nunits
9345 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9346 if (!STMT_SLP_TYPE (stmt_info)
9347 && maybe_ne (nunits, vf)
9348 && dump_enabled_p ())
9349 /* For SLP, VF is set according to the unrolling factor, and not
9350 to the vector size, hence for SLP this print is not valid. */
9351 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9354 /* Pure SLP statements have already been vectorized. We still need
9355 to apply loop vectorization to hybrid SLP statements. */
9356 if (PURE_SLP_STMT (stmt_info))
9357 return false;
9359 if (dump_enabled_p ())
9360 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9362 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9363 *seen_store = stmt_info;
9365 return true;
9368 /* Helper function to pass to simplify_replace_tree to enable replacing trees
9369 in the hash_map with their corresponding values. */
9371 static tree
9372 find_in_mapping (tree t, void *context)
9374 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9376 tree *value = mapping->get (t);
9377 return value ? *value : t;
9380 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9381 original loop that has now been vectorized.
9383 The inits of the data_references need to be advanced with the number of
9384 iterations of the main loop. This has been computed in vect_do_peeling and
9385 is stored in parameter ADVANCE. We first restore the data_references'
9386 initial offsets with the values recorded in ORIG_DRS_INIT.
9388 Since the loop_vec_info of this EPILOGUE was constructed for the original
9389 loop, its stmt_vec_infos all point to the original statements. These need
9390 to be updated to point to their corresponding copies as well as the SSA_NAMES
9391 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9393 The data_references' connections also need to be updated. Their
9394 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
9395 stmt_vec_infos, their statements need to point to their corresponding copies,
9396 if they are gather loads or scatter stores then their references need to be
9397 updated to point to their corresponding copies, and finally we set
9398 'base_misaligned' to false as we have already peeled for alignment in the
9399 prologue of the main loop. */
9401 static void
9402 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9404 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9405 auto_vec<gimple *> stmt_worklist;
9406 hash_map<tree,tree> mapping;
9407 gimple *orig_stmt, *new_stmt;
9408 gimple_stmt_iterator epilogue_gsi;
9409 gphi_iterator epilogue_phi_gsi;
9410 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9411 basic_block *epilogue_bbs = get_loop_body (epilogue);
9412 unsigned i;
9414 free (LOOP_VINFO_BBS (epilogue_vinfo));
9415 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9417 /* Advance data_reference's with the number of iterations of the previous
9418 loop and its prologue. */
9419 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9422 /* The EPILOGUE loop is a copy of the original loop so they share the same
9423 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9424 point to the copied statements. We also create a mapping of all LHS' in
9425 the original loop and all the LHS' in the EPILOGUE and create worklists to
9426 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9427 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9429 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9430 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9432 new_stmt = epilogue_phi_gsi.phi ();
9434 gcc_assert (gimple_uid (new_stmt) > 0);
9435 stmt_vinfo
9436 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9438 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9439 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9441 mapping.put (gimple_phi_result (orig_stmt),
9442 gimple_phi_result (new_stmt));
9443 /* PHI nodes cannot have patterns or related statements. */
9444 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9445 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9448 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9449 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9451 new_stmt = gsi_stmt (epilogue_gsi);
9452 if (is_gimple_debug (new_stmt))
9453 continue;
9455 gcc_assert (gimple_uid (new_stmt) > 0);
9456 stmt_vinfo
9457 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9459 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9460 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9462 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9463 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9465 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9467 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9468 for (gimple_stmt_iterator gsi = gsi_start (seq);
9469 !gsi_end_p (gsi); gsi_next (&gsi))
9470 stmt_worklist.safe_push (gsi_stmt (gsi));
9473 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9474 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9476 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9477 stmt_worklist.safe_push (stmt);
9478 /* Set BB such that the assert in
9479 'get_initial_def_for_reduction' is able to determine that
9480 the BB of the related stmt is inside this loop. */
9481 gimple_set_bb (stmt,
9482 gimple_bb (new_stmt));
9483 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9484 gcc_assert (related_vinfo == NULL
9485 || related_vinfo == stmt_vinfo);
9490 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9491 using the original main loop and thus need to be updated to refer to the
9492 cloned variables used in the epilogue. */
9493 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9495 gimple *stmt = stmt_worklist[i];
9496 tree *new_op;
9498 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9500 tree op = gimple_op (stmt, j);
9501 if ((new_op = mapping.get(op)))
9502 gimple_set_op (stmt, j, *new_op);
9503 else
9505 /* PR92429: The last argument of simplify_replace_tree disables
9506 folding when replacing arguments. This is required as
9507 otherwise you might end up with different statements than the
9508 ones analyzed in vect_loop_analyze, leading to different
9509 vectorization. */
9510 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9511 &find_in_mapping, &mapping, false);
9512 gimple_set_op (stmt, j, op);
9517 struct data_reference *dr;
9518 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9519 FOR_EACH_VEC_ELT (datarefs, i, dr)
9521 orig_stmt = DR_STMT (dr);
9522 gcc_assert (gimple_uid (orig_stmt) > 0);
9523 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9524 /* Data references for gather loads and scatter stores do not use the
9525 updated offset we set using ADVANCE. Instead we have to make sure the
9526 reference in each data reference points to the corresponding copy of
9527 the original in the epilogue. */
9528 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9529 == VMAT_GATHER_SCATTER)
9531 DR_REF (dr)
9532 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9533 &find_in_mapping, &mapping);
9534 DR_BASE_ADDRESS (dr)
9535 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9536 &find_in_mapping, &mapping);
9538 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9539 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9540 /* The vector size of the epilogue is smaller than that of the main loop,
9541 so the alignment is either the same or lower. This means the dr will
9542 by definition be aligned. */
9543 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9546 epilogue_vinfo->shared->datarefs_copy.release ();
9547 epilogue_vinfo->shared->save_datarefs ();
9550 /* Function vect_transform_loop.
9552 The analysis phase has determined that the loop is vectorizable.
9553 Vectorize the loop - create vectorized stmts to replace the scalar
9554 stmts in the loop, and update the loop exit condition.
9555 Returns the scalar epilogue loop if any. */
9557 class loop *
9558 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9560 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9561 class loop *epilogue = NULL;
9562 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9563 int nbbs = loop->num_nodes;
9564 int i;
9565 tree niters_vector = NULL_TREE;
9566 tree step_vector = NULL_TREE;
9567 tree niters_vector_mult_vf = NULL_TREE;
9568 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9569 unsigned int lowest_vf = constant_lower_bound (vf);
9570 gimple *stmt;
9571 bool check_profitability = false;
9572 unsigned int th;
9574 DUMP_VECT_SCOPE ("vec_transform_loop");
9576 loop_vinfo->shared->check_datarefs ();
9578 /* Use the more conservative vectorization threshold. If the number
9579 of iterations is constant assume the cost check has been performed
9580 by our caller. If the threshold makes all loops profitable that
9581 run at least the (estimated) vectorization factor number of times
9582 checking is pointless, too. */
9583 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9584 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9586 if (dump_enabled_p ())
9587 dump_printf_loc (MSG_NOTE, vect_location,
9588 "Profitability threshold is %d loop iterations.\n",
9589 th);
9590 check_profitability = true;
9593 /* Make sure there exists a single-predecessor exit bb. Do this before
9594 versioning. */
9595 edge e = single_exit (loop);
9596 if (! single_pred_p (e->dest))
9598 split_loop_exit_edge (e, true);
9599 if (dump_enabled_p ())
9600 dump_printf (MSG_NOTE, "split exit edge\n");
9603 /* Version the loop first, if required, so the profitability check
9604 comes first. */
9606 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9608 class loop *sloop
9609 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9610 sloop->force_vectorize = false;
9611 check_profitability = false;
9614 /* Make sure there exists a single-predecessor exit bb also on the
9615 scalar loop copy. Do this after versioning but before peeling
9616 so CFG structure is fine for both scalar and if-converted loop
9617 to make slpeel_duplicate_current_defs_from_edges face matched
9618 loop closed PHI nodes on the exit. */
9619 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9621 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9622 if (! single_pred_p (e->dest))
9624 split_loop_exit_edge (e, true);
9625 if (dump_enabled_p ())
9626 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9630 tree niters = vect_build_loop_niters (loop_vinfo);
9631 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9632 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9633 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9634 tree advance;
9635 drs_init_vec orig_drs_init;
9637 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9638 &step_vector, &niters_vector_mult_vf, th,
9639 check_profitability, niters_no_overflow,
9640 &advance);
9642 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9643 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9644 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9645 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9647 if (niters_vector == NULL_TREE)
9649 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9650 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9651 && known_eq (lowest_vf, vf))
9653 niters_vector
9654 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9655 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9656 step_vector = build_one_cst (TREE_TYPE (niters));
9658 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9659 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9660 &step_vector, niters_no_overflow);
9661 else
9662 /* vect_do_peeling subtracted the number of peeled prologue
9663 iterations from LOOP_VINFO_NITERS. */
9664 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9665 &niters_vector, &step_vector,
9666 niters_no_overflow);
9669 /* 1) Make sure the loop header has exactly two entries
9670 2) Make sure we have a preheader basic block. */
9672 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9674 split_edge (loop_preheader_edge (loop));
9676 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9677 /* This will deal with any possible peeling. */
9678 vect_prepare_for_masked_peels (loop_vinfo);
9680 /* Schedule the SLP instances first, then handle loop vectorization
9681 below. */
9682 if (!loop_vinfo->slp_instances.is_empty ())
9684 DUMP_VECT_SCOPE ("scheduling SLP instances");
9685 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9688 /* FORNOW: the vectorizer supports only loops whose body consists
9689 of one basic block (header + empty latch). When the vectorizer
9690 supports more involved loop forms, the order in which the BBs are
9691 traversed needs to be reconsidered. */
9693 for (i = 0; i < nbbs; i++)
9695 basic_block bb = bbs[i];
9696 stmt_vec_info stmt_info;
9698 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9699 gsi_next (&si))
9701 gphi *phi = si.phi ();
9702 if (dump_enabled_p ())
9703 dump_printf_loc (MSG_NOTE, vect_location,
9704 "------>vectorizing phi: %G", phi);
9705 stmt_info = loop_vinfo->lookup_stmt (phi);
9706 if (!stmt_info)
9707 continue;
9709 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9710 vect_loop_kill_debug_uses (loop, stmt_info);
9712 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9713 && !STMT_VINFO_LIVE_P (stmt_info))
9714 continue;
9716 if (STMT_VINFO_VECTYPE (stmt_info)
9717 && (maybe_ne
9718 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9719 && dump_enabled_p ())
9720 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9722 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9723 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9724 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9725 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9726 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9727 && ! PURE_SLP_STMT (stmt_info))
9729 if (dump_enabled_p ())
9730 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9731 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9735 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9736 gsi_next (&si))
9738 gphi *phi = si.phi ();
9739 stmt_info = loop_vinfo->lookup_stmt (phi);
9740 if (!stmt_info)
9741 continue;
9743 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9744 && !STMT_VINFO_LIVE_P (stmt_info))
9745 continue;
9747 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9748 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9749 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9750 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9751 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9752 && ! PURE_SLP_STMT (stmt_info))
9753 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9756 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9757 !gsi_end_p (si);)
9759 stmt = gsi_stmt (si);
9760 /* During vectorization remove existing clobber stmts. */
9761 if (gimple_clobber_p (stmt))
9763 unlink_stmt_vdef (stmt);
9764 gsi_remove (&si, true);
9765 release_defs (stmt);
9767 else
9769 /* Ignore vector stmts created in the outer loop. */
9770 stmt_info = loop_vinfo->lookup_stmt (stmt);
9772 /* vector stmts created in the outer-loop during vectorization of
9773 stmts in an inner-loop may not have a stmt_info, and do not
9774 need to be vectorized. */
9775 stmt_vec_info seen_store = NULL;
9776 if (stmt_info)
9778 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9780 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9781 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9782 !gsi_end_p (subsi); gsi_next (&subsi))
9784 stmt_vec_info pat_stmt_info
9785 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9786 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9787 &si, &seen_store);
9789 stmt_vec_info pat_stmt_info
9790 = STMT_VINFO_RELATED_STMT (stmt_info);
9791 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9792 &si, &seen_store))
9793 maybe_set_vectorized_backedge_value (loop_vinfo,
9794 pat_stmt_info);
9796 else
9798 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9799 &seen_store))
9800 maybe_set_vectorized_backedge_value (loop_vinfo,
9801 stmt_info);
9804 gsi_next (&si);
9805 if (seen_store)
9807 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9808 /* Interleaving. The vectorization of the
9809 interleaving chain was completed -
9810 free all the stores in the chain. */
9811 vect_remove_stores (loop_vinfo,
9812 DR_GROUP_FIRST_ELEMENT (seen_store));
9813 else
9814 /* Free the attached stmt_vec_info and remove the stmt. */
9815 loop_vinfo->remove_stmt (stmt_info);
9820 /* Stub out scalar statements that must not survive vectorization.
9821 Doing this here helps with grouped statements, or statements that
9822 are involved in patterns. */
9823 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9824 !gsi_end_p (gsi); gsi_next (&gsi))
9826 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9827 if (!call || !gimple_call_internal_p (call))
9828 continue;
9829 internal_fn ifn = gimple_call_internal_fn (call);
9830 if (ifn == IFN_MASK_LOAD)
9832 tree lhs = gimple_get_lhs (call);
9833 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9835 tree zero = build_zero_cst (TREE_TYPE (lhs));
9836 gimple *new_stmt = gimple_build_assign (lhs, zero);
9837 gsi_replace (&gsi, new_stmt, true);
9840 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9842 tree lhs = gimple_get_lhs (call);
9843 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9845 tree else_arg
9846 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9847 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9848 gsi_replace (&gsi, new_stmt, true);
9852 } /* BBs in loop */
9854 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9855 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9856 if (integer_onep (step_vector))
9857 niters_no_overflow = true;
9858 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9859 niters_vector_mult_vf, !niters_no_overflow);
9861 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9862 scale_profile_for_vect_loop (loop, assumed_vf);
9864 /* True if the final iteration might not handle a full vector's
9865 worth of scalar iterations. */
9866 bool final_iter_may_be_partial
9867 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9868 /* The minimum number of iterations performed by the epilogue. This
9869 is 1 when peeling for gaps because we always need a final scalar
9870 iteration. */
9871 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9872 /* +1 to convert latch counts to loop iteration counts,
9873 -min_epilogue_iters to remove iterations that cannot be performed
9874 by the vector code. */
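/* Worked example (illustrative numbers): with a scalar latch bound of 100
   (at most 101 iterations), lowest_vf == 4, no peeling for gaps
   (min_epilogue_iters == 0, bias_for_lowest == 1) and no mask-based peeling
   for alignment, the computation below yields
   floor ((100 + 1) / 4) - 1 == 24, i.e. the vector loop runs at most
   25 times.  */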
9875 int bias_for_lowest = 1 - min_epilogue_iters;
9876 int bias_for_assumed = bias_for_lowest;
9877 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9878 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9880 /* When the amount of peeling is known at compile time, the first
9881 iteration will have exactly alignment_npeels active elements.
9882 In the worst case it will have at least one. */
9883 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9884 bias_for_lowest += lowest_vf - min_first_active;
9885 bias_for_assumed += assumed_vf - min_first_active;
9887 /* In these calculations the "- 1" converts loop iteration counts
9888 back to latch counts. */
9889 if (loop->any_upper_bound)
9891 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9892 loop->nb_iterations_upper_bound
9893 = (final_iter_may_be_partial
9894 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9895 lowest_vf) - 1
9896 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9897 lowest_vf) - 1);
9898 if (main_vinfo)
9900 unsigned int bound;
9901 poly_uint64 main_iters
9902 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
9903 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
9904 main_iters
9905 = upper_bound (main_iters,
9906 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
9907 if (can_div_away_from_zero_p (main_iters,
9908 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9909 &bound))
9910 loop->nb_iterations_upper_bound
9911 = wi::umin ((widest_int) (bound - 1),
9912 loop->nb_iterations_upper_bound);
9915 if (loop->any_likely_upper_bound)
9916 loop->nb_iterations_likely_upper_bound
9917 = (final_iter_may_be_partial
9918 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9919 + bias_for_lowest, lowest_vf) - 1
9920 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9921 + bias_for_lowest, lowest_vf) - 1);
9922 if (loop->any_estimate)
9923 loop->nb_iterations_estimate
9924 = (final_iter_may_be_partial
9925 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9926 assumed_vf) - 1
9927 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9928 assumed_vf) - 1);
9930 if (dump_enabled_p ())
9932 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9934 dump_printf_loc (MSG_NOTE, vect_location,
9935 "LOOP VECTORIZED\n");
9936 if (loop->inner)
9937 dump_printf_loc (MSG_NOTE, vect_location,
9938 "OUTER LOOP VECTORIZED\n");
9939 dump_printf (MSG_NOTE, "\n");
9941 else
9942 dump_printf_loc (MSG_NOTE, vect_location,
9943 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9944 GET_MODE_NAME (loop_vinfo->vector_mode));
9947 /* Loops vectorized with a variable factor won't benefit from
9948 unrolling/peeling. */
9949 if (!vf.is_constant ())
9951 loop->unroll = 1;
9952 if (dump_enabled_p ())
9953 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9954 " variable-length vectorization factor\n");
9956 /* Free SLP instances here because otherwise stmt reference counting
9957 won't work. */
9958 slp_instance instance;
9959 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9960 vect_free_slp_instance (instance);
9961 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9962 /* Clear the safelen field since its value is invalid after vectorization,
9963 as the vectorized loop can have loop-carried dependencies. */
9964 loop->safelen = 0;
9966 if (epilogue)
9968 update_epilogue_loop_vinfo (epilogue, advance);
9970 epilogue->simduid = loop->simduid;
9971 epilogue->force_vectorize = loop->force_vectorize;
9972 epilogue->dont_vectorize = false;
9975 return epilogue;
9978 /* The code below is trying to perform a simple optimization - revert
9979 if-conversion for masked stores, i.e. if the mask of a store is zero,
9980 do not perform the store and, if possible, also skip the producers of the stored values.
9981 For example,
9982 for (i=0; i<n; i++)
9983 if (c[i])
9985 p1[i] += 1;
9986 p2[i] = p3[i] +2;
9988 this transformation will produce the following semi-hammock:
9990 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9992 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9993 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9994 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9995 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9996 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9997 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);

void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	    worklist.safe_push (stmt);
	}
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and an if-then structure in the CFG; then_bb belongs
	 to the same loop as if_bb.  It may be different from LOOP when a
	 two-level loop nest is vectorized and the mask_store belongs to the
	 inner loop.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Mark the edge into STORE_BB as unlikely.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores.",
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM.3 = VDEF <.MEM_1>
	 and new PHI node will be created in join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (&gsi);
	  gsi_to = gsi_start_bb (store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (store_bb);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Move stmt to created bb\n%G", last);
	  /* Move all stored value producers if possible.  */
	  while (!gsi_end_p (gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gsi_stmt (gsi)))
		{
		  gsi_prev (&gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements that write to memory or have
		 a volatile operand.  */
	      if (gimple_vdef (stmt1)
		  || gimple_has_volatile_ops (stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (&gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      continue;
		    }
		  break;
		}
	      /* Check that LHS does not have uses outside of STORE_BB.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (use_stmt))
		    continue;
		  if (gimple_bb (use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

	      if (gimple_vuse (stmt1)
		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Move stmt to created bb\n%G", stmt1);
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (&gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (worklist.last (), 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}
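
/* Illustrative sketch: after the sinking above, each group of masked stores
   guarded by the same mask ends up in a CFG shape roughly like the following,
   where the block names follow the local variables in optimize_mask_stores:

     bb:
       ...
       if (mask == { 0, ... })      <- all-false mask
	 goto join_bb;              <- EDGE_TRUE_VALUE, stores skipped
       else
	 goto store_bb;             <- EDGE_FALSE_VALUE

     store_bb:
       <sunk MASK_STOREs and the producers of their stored values>
       goto join_bb;

     join_bb:
       .MEM_2 = PHI <.MEM_1 (bb), .MEM_3 (store_bb)>
       ...  */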

/* Decide whether it is possible to use a zero-based induction variable
   when vectorizing LOOP_VINFO with partial vectors.  If it is, return
   the value that the induction variable must be able to hold in order
   to ensure that the rgroups eventually have no active vector elements.
   Return -1 otherwise.  */

widest_int
vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
{
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

  /* Calculate the value that the induction variable must be able
     to hit in order to ensure that we end the loop with an all-false mask.
     This involves adding the maximum number of inactive trailing scalar
     iterations.  */
  widest_int iv_limit = -1;
  if (max_loop_iterations (loop, &iv_limit))
    {
      if (niters_skip)
	{
	  /* Add the maximum number of skipped iterations to the
	     maximum iteration count.  */
	  if (TREE_CODE (niters_skip) == INTEGER_CST)
	    iv_limit += wi::to_widest (niters_skip);
	  else
	    iv_limit += max_vf - 1;
	}
      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
	/* Make a conservatively-correct assumption.  */
	iv_limit += max_vf - 1;

      /* IV_LIMIT is the maximum number of latch iterations, which is also
	 the maximum in-range IV value.  Round this value down to the previous
	 vector alignment boundary and then add an extra full iteration.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
    }
  return iv_limit;
}
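
/* Worked example for the limit above (illustrative only): for a loop whose
   maximum latch count is 1001, a constant vectorization factor of 4 (so
   max_vf == 4 and known_alignment (vf) == 4) and no skipped or peeled
   iterations, the function returns
     (1001 & -4) + 4 == 1000 + 4 == 1004,
   i.e. the IV must be able to represent 1004 without wrapping.  */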

/* For the given rgroup_controls RGC, check whether an induction variable
   would ever hit a value that produces a set of all-false masks or zero
   lengths before wrapping around.  Return true if it's possible to wrap
   around before hitting the desirable value, otherwise return false.  */

bool
vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
{
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);

  if (iv_limit == -1)
    return true;

  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;

  if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
    return true;

  return false;
}
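
/* Worked example (illustrative only): with iv_limit == 1004 as in the sketch
   above, max_nscalars_per_iter == 2 and factor == 2 give nitems == 4, so the
   IV must be able to count up to 1004 * 4 == 4016, which needs 12 bits as an
   unsigned value.  A 16-bit compare type therefore cannot wrap (12 <= 16),
   whereas an 8-bit compare type would make this function return true.  */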