gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2021 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "diagnostic-core.h"
  36 #include "fold-const.h"
  37 #include "stor-layout.h"
  38 #include "cfganal.h"
  39 #include "gimplify.h"
  40 #include "gimple-iterator.h"
  41 #include "gimplify-me.h"
  42 #include "tree-ssa-loop-ivopts.h"
  43 #include "tree-ssa-loop-manip.h"
  44 #include "tree-ssa-loop-niter.h"
  45 #include "tree-ssa-loop.h"
  46 #include "cfgloop.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
  91    during analysis and transformation. The types of data-refs that the
  92    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  94    accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
 145         Since we only vectorize operations which vector form can be
 146    expressed using existing tree codes, to verify that an operation is
 147    supported, the vectorizer checks the relevant optab at the relevant
 148    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 149    the value found is CODE_FOR_nothing, then there's no target support, and
 150    we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155
 156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 158                                                bool *, bool *);
 159
 160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 161    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 162    may already be set for general statements (not just data refs).  */
 163
 164 static opt_result
 165 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 166                               bool vectype_maybe_set_p,
 167                               poly_uint64 *vf)
 168 {
 169   gimple *stmt = stmt_info->stmt;
 170
 171   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 172        && !STMT_VINFO_LIVE_P (stmt_info))
 173       || gimple_clobber_p (stmt))
 174     {
 175       if (dump_enabled_p ())
 176         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 177       return opt_result::success ();
 178     }
 179
 180   tree stmt_vectype, nunits_vectype;
 181   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 182                                                    &stmt_vectype,
 183                                                    &nunits_vectype);
 184   if (!res)
 185     return res;
 186
 187   if (stmt_vectype)
 188     {
 189       if (STMT_VINFO_VECTYPE (stmt_info))
 190         /* The only case when a vectype had been already set is for stmts
 191            that contain a data ref, or for "pattern-stmts" (stmts generated
 192            by the vectorizer to represent/replace a certain idiom).  */
 193         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 194                      || vectype_maybe_set_p)
 195                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 196       else
 197         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 198     }
 199
 200   if (nunits_vectype)
 201     vect_update_max_nunits (vf, nunits_vectype);
 202
 203   return opt_result::success ();
 204 }
 205
 206 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 207    types of STMT_INFO and all attached pattern statements and update
 208    the vectorization factor VF accordingly.  Return true on success
 209    or false if something prevented vectorization.  */
 210
 211 static opt_result
 212 vect_determine_vf_for_stmt (vec_info *vinfo,
 213                             stmt_vec_info stmt_info, poly_uint64 *vf)
 214 {
 215   if (dump_enabled_p ())
 216     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 217                      stmt_info->stmt);
 218   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 219   if (!res)
 220     return res;
 221
 222   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 223       && STMT_VINFO_RELATED_STMT (stmt_info))
 224     {
 225       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 226       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 227
 228       /* If a pattern statement has def stmts, analyze them too.  */
 229       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 230            !gsi_end_p (si); gsi_next (&si))
 231         {
 232           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 233           if (dump_enabled_p ())
 234             dump_printf_loc (MSG_NOTE, vect_location,
 235                              "==> examining pattern def stmt: %G",
 236                              def_stmt_info->stmt);
 237           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 238           if (!res)
 239             return res;
 240         }
 241
 242       if (dump_enabled_p ())
 243         dump_printf_loc (MSG_NOTE, vect_location,
 244                          "==> examining pattern statement: %G",
 245                          stmt_info->stmt);
 246       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 247       if (!res)
 248         return res;
 249     }
 250
 251   return opt_result::success ();
 252 }
 253
 254 /* Function vect_determine_vectorization_factor
 255
 256    Determine the vectorization factor (VF).  VF is the number of data elements
 257    that are operated upon in parallel in a single iteration of the vectorized
 258    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 259    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 260    elements can fit in a single vector register.
 261
 262    We currently support vectorization of loops in which all types operated upon
 263    are of the same size.  Therefore this function currently sets VF according to
 264    the size of the types operated upon, and fails if there are multiple sizes
 265    in the loop.
 266
 267    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 268    original loop:
 269         for (i=0; i<N; i++){
 270           a[i] = b[i] + c[i];
 271         }
 272
 273    vectorized loop:
 274         for (i=0; i<N; i+=VF){
 275           a[i:VF] = b[i:VF] + c[i:VF];
 276         }
 277 */
 278
 279 static opt_result
 280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 281 {
 282   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 283   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 284   unsigned nbbs = loop->num_nodes;
 285   poly_uint64 vectorization_factor = 1;
 286   tree scalar_type = NULL_TREE;
 287   gphi *phi;
 288   tree vectype;
 289   stmt_vec_info stmt_info;
 290   unsigned i;
 291
 292   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 293
 294   for (i = 0; i < nbbs; i++)
 295     {
 296       basic_block bb = bbs[i];
 297
 298       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 299            gsi_next (&si))
 300         {
 301           phi = si.phi ();
 302           stmt_info = loop_vinfo->lookup_stmt (phi);
 303           if (dump_enabled_p ())
 304             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 305                              phi);
 306
 307           gcc_assert (stmt_info);
 308
 309           if (STMT_VINFO_RELEVANT_P (stmt_info)
 310               || STMT_VINFO_LIVE_P (stmt_info))
 311             {
 312               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 313               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 314
 315               if (dump_enabled_p ())
 316                 dump_printf_loc (MSG_NOTE, vect_location,
 317                                  "get vectype for scalar type:  %T\n",
 318                                  scalar_type);
 319
 320               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 321               if (!vectype)
 322                 return opt_result::failure_at (phi,
 323                                                "not vectorized: unsupported "
 324                                                "data-type %T\n",
 325                                                scalar_type);
 326               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 327
 328               if (dump_enabled_p ())
 329                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 330                                  vectype);
 331
 332               if (dump_enabled_p ())
 333                 {
 334                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 335                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 336                   dump_printf (MSG_NOTE, "\n");
 337                 }
 338
 339               vect_update_max_nunits (&vectorization_factor, vectype);
 340             }
 341         }
 342
 343       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 344            gsi_next (&si))
 345         {
 346           if (is_gimple_debug (gsi_stmt (si)))
 347             continue;
 348           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 349           opt_result res
 350             = vect_determine_vf_for_stmt (loop_vinfo,
 351                                           stmt_info, &vectorization_factor);
 352           if (!res)
 353             return res;
 354         }
 355     }
 356
 357   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 358   if (dump_enabled_p ())
 359     {
 360       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 361       dump_dec (MSG_NOTE, vectorization_factor);
 362       dump_printf (MSG_NOTE, "\n");
 363     }
 364
 365   if (known_le (vectorization_factor, 1U))
 366     return opt_result::failure_at (vect_location,
 367                                    "not vectorized: unsupported data-type\n");
 368   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 369   return opt_result::success ();
 370 }
 371
 372
 373 /* Function vect_is_simple_iv_evolution.
 374
 375    FORNOW: A simple evolution of an induction variables in the loop is
 376    considered a polynomial evolution.  */
 377
 378 static bool
 379 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 380                              tree * step)
 381 {
 382   tree init_expr;
 383   tree step_expr;
 384   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 385   basic_block bb;
 386
 387   /* When there is no evolution in this loop, the evolution function
 388      is not "simple".  */
 389   if (evolution_part == NULL_TREE)
 390     return false;
 391
 392   /* When the evolution is a polynomial of degree >= 2
 393      the evolution function is not "simple".  */
 394   if (tree_is_chrec (evolution_part))
 395     return false;
 396
 397   step_expr = evolution_part;
 398   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 399
 400   if (dump_enabled_p ())
 401     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 402                      step_expr, init_expr);
 403
 404   *init = init_expr;
 405   *step = step_expr;
 406
 407   if (TREE_CODE (step_expr) != INTEGER_CST
 408       && (TREE_CODE (step_expr) != SSA_NAME
 409           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 410               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 411           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 412               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 413                   || !flag_associative_math)))
 414       && (TREE_CODE (step_expr) != REAL_CST
 415           || !flag_associative_math))
 416     {
 417       if (dump_enabled_p ())
 418         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 419                          "step unknown.\n");
 420       return false;
 421     }
 422
 423   return true;
 424 }
 425
 426 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 427    what we are assuming is a double reduction.  For example, given
 428    a structure like this:
 429
 430       outer1:
 431         x_1 = PHI <x_4(outer2), ...>;
 432         ...
 433
 434       inner:
 435         x_2 = PHI <x_1(outer1), ...>;
 436         ...
 437         x_3 = ...;
 438         ...
 439
 440       outer2:
 441         x_4 = PHI <x_3(inner)>;
 442         ...
 443
 444    outer loop analysis would treat x_1 as a double reduction phi and
 445    this function would then return true for x_2.  */
 446
 447 static bool
 448 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 449 {
 450   use_operand_p use_p;
 451   ssa_op_iter op_iter;
 452   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 453     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 454       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 455         return true;
 456   return false;
 457 }
 458
 459 /* Function vect_analyze_scalar_cycles_1.
 460
 461    Examine the cross iteration def-use cycles of scalar variables
 462    in LOOP.  LOOP_VINFO represents the loop that is now being
 463    considered for vectorization (can be LOOP, or an outer-loop
 464    enclosing LOOP).  */
 465
 466 static void
 467 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
 468 {
 469   basic_block bb = loop->header;
 470   tree init, step;
 471   auto_vec<stmt_vec_info, 64> worklist;
 472   gphi_iterator gsi;
 473   bool double_reduc, reduc_chain;
 474
 475   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 476
 477   /* First - identify all inductions.  Reduction detection assumes that all the
 478      inductions have been identified, therefore, this order must not be
 479      changed.  */
 480   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 481     {
 482       gphi *phi = gsi.phi ();
 483       tree access_fn = NULL;
 484       tree def = PHI_RESULT (phi);
 485       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 486
 487       if (dump_enabled_p ())
 488         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 489
 490       /* Skip virtual phi's.  The data dependences that are associated with
 491          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 492       if (virtual_operand_p (def))
 493         continue;
 494
 495       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 496
 497       /* Analyze the evolution function.  */
 498       access_fn = analyze_scalar_evolution (loop, def);
 499       if (access_fn)
 500         {
 501           STRIP_NOPS (access_fn);
 502           if (dump_enabled_p ())
 503             dump_printf_loc (MSG_NOTE, vect_location,
 504                              "Access function of PHI: %T\n", access_fn);
 505           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 506             = initial_condition_in_loop_num (access_fn, loop->num);
 507           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 508             = evolution_part_in_loop_num (access_fn, loop->num);
 509         }
 510
 511       if (!access_fn
 512           || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 513           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 514           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 515               && TREE_CODE (step) != INTEGER_CST))
 516         {
 517           worklist.safe_push (stmt_vinfo);
 518           continue;
 519         }
 520
 521       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 522                   != NULL_TREE);
 523       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 524
 525       if (dump_enabled_p ())
 526         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 527       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 528     }
 529
 530
 531   /* Second - identify all reductions and nested cycles.  */
 532   while (worklist.length () > 0)
 533     {
 534       stmt_vec_info stmt_vinfo = worklist.pop ();
 535       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 536       tree def = PHI_RESULT (phi);
 537
 538       if (dump_enabled_p ())
 539         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 540
 541       gcc_assert (!virtual_operand_p (def)
 542                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 543
 544       stmt_vec_info reduc_stmt_info
 545         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 546                                     &reduc_chain);
 547       if (reduc_stmt_info)
 548         {
 549           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 550           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 551           if (double_reduc)
 552             {
 553               if (dump_enabled_p ())
 554                 dump_printf_loc (MSG_NOTE, vect_location,
 555                                  "Detected double reduction.\n");
 556
 557               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 558               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 559             }
 560           else
 561             {
 562               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 563                 {
 564                   if (dump_enabled_p ())
 565                     dump_printf_loc (MSG_NOTE, vect_location,
 566                                      "Detected vectorizable nested cycle.\n");
 567
 568                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 569                 }
 570               else
 571                 {
 572                   if (dump_enabled_p ())
 573                     dump_printf_loc (MSG_NOTE, vect_location,
 574                                      "Detected reduction.\n");
 575
 576                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 577                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 578                   /* Store the reduction cycles for possible vectorization in
 579                      loop-aware SLP if it was not detected as reduction
 580                      chain.  */
 581                   if (! reduc_chain)
 582                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 583                       (reduc_stmt_info);
 584                 }
 585             }
 586         }
 587       else
 588         if (dump_enabled_p ())
 589           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 590                            "Unknown def-use cycle pattern.\n");
 591     }
 592 }
 593
 594
 595 /* Function vect_analyze_scalar_cycles.
 596
 597    Examine the cross iteration def-use cycles of scalar variables, by
 598    analyzing the loop-header PHIs of scalar variables.  Classify each
 599    cycle as one of the following: invariant, induction, reduction, unknown.
 600    We do that for the loop represented by LOOP_VINFO, and also to its
 601    inner-loop, if exists.
 602    Examples for scalar cycles:
 603
 604    Example1: reduction:
 605
 606               loop1:
 607               for (i=0; i<N; i++)
 608                  sum += a[i];
 609
 610    Example2: induction:
 611
 612               loop2:
 613               for (i=0; i<N; i++)
 614                  a[i] = i;  */
 615
 616 static void
 617 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 618 {
 619   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 620
 621   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 622
 623   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 624      Reductions in such inner-loop therefore have different properties than
 625      the reductions in the nest that gets vectorized:
 626      1. When vectorized, they are executed in the same order as in the original
 627         scalar loop, so we can't change the order of computation when
 628         vectorizing them.
 629      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 630         current checks are too strict.  */
 631
 632   if (loop->inner)
 633     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 634 }
 635
 636 /* Transfer group and reduction information from STMT_INFO to its
 637    pattern stmt.  */
 638
 639 static void
 640 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 641 {
 642   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 643   stmt_vec_info stmtp;
 644   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 645               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 646   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 647   do
 648     {
 649       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 650       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 651                            == STMT_VINFO_DEF_TYPE (stmt_info));
 652       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 653       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 654       if (stmt_info)
 655         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 656           = STMT_VINFO_RELATED_STMT (stmt_info);
 657     }
 658   while (stmt_info);
 659 }
 660
 661 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 662
 663 static void
 664 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 665 {
 666   stmt_vec_info first;
 667   unsigned i;
 668
 669   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 670     {
 671       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 672       while (next)
 673         {
 674           if ((STMT_VINFO_IN_PATTERN_P (next)
 675                != STMT_VINFO_IN_PATTERN_P (first))
 676               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 677             break;
 678           next = REDUC_GROUP_NEXT_ELEMENT (next);
 679         }
 680       /* If all reduction chain members are well-formed patterns adjust
 681          the group to group the pattern stmts instead.  */
 682       if (! next
 683           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 684         {
 685           if (STMT_VINFO_IN_PATTERN_P (first))
 686             {
 687               vect_fixup_reduc_chain (first);
 688               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 689                 = STMT_VINFO_RELATED_STMT (first);
 690             }
 691         }
 692       /* If not all stmt in the chain are patterns or if we failed
 693          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 694          it as regular reduction instead.  */
 695       else
 696         {
 697           stmt_vec_info vinfo = first;
 698           stmt_vec_info last = NULL;
 699           while (vinfo)
 700             {
 701               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 702               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 703               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 704               last = vinfo;
 705               vinfo = next;
 706             }
 707           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 708             = vect_internal_def;
 709           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 710           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 711           --i;
 712         }
 713     }
 714 }
 715
 716 /* Function vect_get_loop_niters.
 717
 718    Determine how many iterations the loop is executed and place it
 719    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 720    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 721    niter information holds in ASSUMPTIONS.
 722
 723    Return the loop exit condition.  */
 724
 725
 726 static gcond *
 727 vect_get_loop_niters (class loop *loop, tree *assumptions,
 728                       tree *number_of_iterations, tree *number_of_iterationsm1)
 729 {
 730   edge exit = single_exit (loop);
 731   class tree_niter_desc niter_desc;
 732   tree niter_assumptions, niter, may_be_zero;
 733   gcond *cond = get_loop_exit_condition (loop);
 734
 735   *assumptions = boolean_true_node;
 736   *number_of_iterationsm1 = chrec_dont_know;
 737   *number_of_iterations = chrec_dont_know;
 738   DUMP_VECT_SCOPE ("get_loop_niters");
 739
 740   if (!exit)
 741     return cond;
 742
 743   may_be_zero = NULL_TREE;
 744   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 745       || chrec_contains_undetermined (niter_desc.niter))
 746     return cond;
 747
 748   niter_assumptions = niter_desc.assumptions;
 749   may_be_zero = niter_desc.may_be_zero;
 750   niter = niter_desc.niter;
 751
 752   if (may_be_zero && integer_zerop (may_be_zero))
 753     may_be_zero = NULL_TREE;
 754
 755   if (may_be_zero)
 756     {
 757       if (COMPARISON_CLASS_P (may_be_zero))
 758         {
 759           /* Try to combine may_be_zero with assumptions, this can simplify
 760              computation of niter expression.  */
 761           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 762             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 763                                              niter_assumptions,
 764                                              fold_build1 (TRUTH_NOT_EXPR,
 765                                                           boolean_type_node,
 766                                                           may_be_zero));
 767           else
 768             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 769                                  build_int_cst (TREE_TYPE (niter), 0),
 770                                  rewrite_to_non_trapping_overflow (niter));
 771
 772           may_be_zero = NULL_TREE;
 773         }
 774       else if (integer_nonzerop (may_be_zero))
 775         {
 776           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 777           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 778           return cond;
 779         }
 780       else
 781         return cond;
 782     }
 783
 784   *assumptions = niter_assumptions;
 785   *number_of_iterationsm1 = niter;
 786
 787   /* We want the number of loop header executions which is the number
 788      of latch executions plus one.
 789      ???  For UINT_MAX latch executions this number overflows to zero
 790      for loops like do { n++; } while (n != 0);  */
 791   if (niter && !chrec_contains_undetermined (niter))
 792     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 793                           build_int_cst (TREE_TYPE (niter), 1));
 794   *number_of_iterations = niter;
 795
 796   return cond;
 797 }
 798
 799 /* Function bb_in_loop_p
 800
 801    Used as predicate for dfs order traversal of the loop bbs.  */
 802
 803 static bool
 804 bb_in_loop_p (const_basic_block bb, const void *data)
 805 {
 806   const class loop *const loop = (const class loop *)data;
 807   if (flow_bb_inside_loop_p (loop, bb))
 808     return true;
 809   return false;
 810 }
 811
 812
 813 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 814    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 815
 816 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
 817   : vec_info (vec_info::loop, shared),
 818     loop (loop_in),
 819     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 820     num_itersm1 (NULL_TREE),
 821     num_iters (NULL_TREE),
 822     num_iters_unchanged (NULL_TREE),
 823     num_iters_assumptions (NULL_TREE),
 824     th (0),
 825     versioning_threshold (0),
 826     vectorization_factor (0),
 827     main_loop_edge (nullptr),
 828     skip_main_loop_edge (nullptr),
 829     skip_this_loop_edge (nullptr),
 830     reusable_accumulators (),
 831     max_vectorization_factor (0),
 832     mask_skip_niters (NULL_TREE),
 833     rgroup_compare_type (NULL_TREE),
 834     simd_if_cond (NULL_TREE),
 835     unaligned_dr (NULL),
 836     peeling_for_alignment (0),
 837     ptr_mask (0),
 838     ivexpr_map (NULL),
 839     scan_map (NULL),
 840     slp_unrolling_factor (1),
 841     single_scalar_iteration_cost (0),
 842     vec_outside_cost (0),
 843     vec_inside_cost (0),
 844     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
 845     vectorizable (false),
 846     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
 847     using_partial_vectors_p (false),
 848     epil_using_partial_vectors_p (false),
 849     peeling_for_gaps (false),
 850     peeling_for_niter (false),
 851     no_data_dependencies (false),
 852     has_mask_store (false),
 853     scalar_loop_scaling (profile_probability::uninitialized ()),
 854     scalar_loop (NULL),
 855     orig_loop_info (NULL)
 856 {
 857   /* CHECKME: We want to visit all BBs before their successors (except for
 858      latch blocks, for which this assertion wouldn't hold).  In the simple
 859      case of the loop forms we allow, a dfs order of the BBs would the same
 860      as reversed postorder traversal, so we are safe.  */
 861
 862   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 863                                           bbs, loop->num_nodes, loop);
 864   gcc_assert (nbbs == loop->num_nodes);
 865
 866   for (unsigned int i = 0; i < nbbs; i++)
 867     {
 868       basic_block bb = bbs[i];
 869       gimple_stmt_iterator si;
 870
 871       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 872         {
 873           gimple *phi = gsi_stmt (si);
 874           gimple_set_uid (phi, 0);
 875           add_stmt (phi);
 876         }
 877
 878       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 879         {
 880           gimple *stmt = gsi_stmt (si);
 881           gimple_set_uid (stmt, 0);
 882           if (is_gimple_debug (stmt))
 883             continue;
 884           add_stmt (stmt);
 885           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
 886              third argument is the #pragma omp simd if (x) condition, when 0,
 887              loop shouldn't be vectorized, when non-zero constant, it should
 888              be vectorized normally, otherwise versioned with vectorized loop
 889              done if the condition is non-zero at runtime.  */
 890           if (loop_in->simduid
 891               && is_gimple_call (stmt)
 892               && gimple_call_internal_p (stmt)
 893               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
 894               && gimple_call_num_args (stmt) >= 3
 895               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
 896               && (loop_in->simduid
 897                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
 898             {
 899               tree arg = gimple_call_arg (stmt, 2);
 900               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
 901                 simd_if_cond = arg;
 902               else
 903                 gcc_assert (integer_nonzerop (arg));
 904             }
 905         }
 906     }
 907
 908   epilogue_vinfos.create (6);
 909 }
 910
 911 /* Free all levels of rgroup CONTROLS.  */
 912
 913 void
 914 release_vec_loop_controls (vec<rgroup_controls> *controls)
 915 {
 916   rgroup_controls *rgc;
 917   unsigned int i;
 918   FOR_EACH_VEC_ELT (*controls, i, rgc)
 919     rgc->controls.release ();
 920   controls->release ();
 921 }
 922
 923 /* Free all memory used by the _loop_vec_info, as well as all the
 924    stmt_vec_info structs of all the stmts in the loop.  */
 925
 926 _loop_vec_info::~_loop_vec_info ()
 927 {
 928   free (bbs);
 929
 930   release_vec_loop_controls (&masks);
 931   release_vec_loop_controls (&lens);
 932   delete ivexpr_map;
 933   delete scan_map;
 934   epilogue_vinfos.release ();
 935
 936   /* When we release an epiloge vinfo that we do not intend to use
 937      avoid clearing AUX of the main loop which should continue to
 938      point to the main loop vinfo since otherwise we'll leak that.  */
 939   if (loop->aux == this)
 940     loop->aux = NULL;
 941 }
 942
 943 /* Return an invariant or register for EXPR and emit necessary
 944    computations in the LOOP_VINFO loop preheader.  */
 945
 946 tree
 947 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 948 {
 949   if (is_gimple_reg (expr)
 950       || is_gimple_min_invariant (expr))
 951     return expr;
 952
 953   if (! loop_vinfo->ivexpr_map)
 954     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 955   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 956   if (! cached)
 957     {
 958       gimple_seq stmts = NULL;
 959       cached = force_gimple_operand (unshare_expr (expr),
 960                                      &stmts, true, NULL_TREE);
 961       if (stmts)
 962         {
 963           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 964           gsi_insert_seq_on_edge_immediate (e, stmts);
 965         }
 966     }
 967   return cached;
 968 }
 969
 970 /* Return true if we can use CMP_TYPE as the comparison type to produce
 971    all masks required to mask LOOP_VINFO.  */
 972
 973 static bool
 974 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 975 {
 976   rgroup_controls *rgm;
 977   unsigned int i;
 978   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 979     if (rgm->type != NULL_TREE
 980         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 981                                             cmp_type, rgm->type,
 982                                             OPTIMIZE_FOR_SPEED))
 983       return false;
 984   return true;
 985 }
 986
 987 /* Calculate the maximum number of scalars per iteration for every
 988    rgroup in LOOP_VINFO.  */
 989
 990 static unsigned int
 991 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
 992 {
 993   unsigned int res = 1;
 994   unsigned int i;
 995   rgroup_controls *rgm;
 996   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 997     res = MAX (res, rgm->max_nscalars_per_iter);
 998   return res;
 999 }
1000
1001 /* Calculate the minimum precision necessary to represent:
1002
1003       MAX_NITERS * FACTOR
1004
1005    as an unsigned integer, where MAX_NITERS is the maximum number of
1006    loop header iterations for the original scalar form of LOOP_VINFO.  */
1007
1008 static unsigned
1009 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1010 {
1011   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1012
1013   /* Get the maximum number of iterations that is representable
1014      in the counter type.  */
1015   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1016   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1017
1018   /* Get a more refined estimate for the number of iterations.  */
1019   widest_int max_back_edges;
1020   if (max_loop_iterations (loop, &max_back_edges))
1021     max_ni = wi::smin (max_ni, max_back_edges + 1);
1022
1023   /* Work out how many bits we need to represent the limit.  */
1024   return wi::min_precision (max_ni * factor, UNSIGNED);
1025 }
1026
1027 /* True if the loop needs peeling or partial vectors when vectorized.  */
1028
1029 static bool
1030 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1031 {
1032   unsigned HOST_WIDE_INT const_vf;
1033   HOST_WIDE_INT max_niter
1034     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1035
1036   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1037   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1038     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1039                                           (loop_vinfo));
1040
1041   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1042       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1043     {
1044       /* Work out the (constant) number of iterations that need to be
1045          peeled for reasons other than niters.  */
1046       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1047       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1048         peel_niter += 1;
1049       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1050                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1051         return true;
1052     }
1053   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1054       /* ??? When peeling for gaps but not alignment, we could
1055          try to check whether the (variable) niters is known to be
1056          VF * N + 1.  That's something of a niche case though.  */
1057       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1058       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1059       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1060            < (unsigned) exact_log2 (const_vf))
1061           /* In case of versioning, check if the maximum number of
1062              iterations is greater than th.  If they are identical,
1063              the epilogue is unnecessary.  */
1064           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1065               || ((unsigned HOST_WIDE_INT) max_niter
1066                   > (th / const_vf) * const_vf))))
1067     return true;
1068
1069   return false;
1070 }
1071
1072 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1073    whether we can actually generate the masks required.  Return true if so,
1074    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1075
1076 static bool
1077 vect_verify_full_masking (loop_vec_info loop_vinfo)
1078 {
1079   unsigned int min_ni_width;
1080   unsigned int max_nscalars_per_iter
1081     = vect_get_max_nscalars_per_iter (loop_vinfo);
1082
1083   /* Use a normal loop if there are no statements that need masking.
1084      This only happens in rare degenerate cases: it means that the loop
1085      has no loads, no stores, and no live-out values.  */
1086   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1087     return false;
1088
1089   /* Work out how many bits we need to represent the limit.  */
1090   min_ni_width
1091     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1092
1093   /* Find a scalar mode for which WHILE_ULT is supported.  */
1094   opt_scalar_int_mode cmp_mode_iter;
1095   tree cmp_type = NULL_TREE;
1096   tree iv_type = NULL_TREE;
1097   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1098   unsigned int iv_precision = UINT_MAX;
1099
1100   if (iv_limit != -1)
1101     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1102                                       UNSIGNED);
1103
1104   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1105     {
1106       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1107       if (cmp_bits >= min_ni_width
1108           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1109         {
1110           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1111           if (this_type
1112               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1113             {
1114               /* Although we could stop as soon as we find a valid mode,
1115                  there are at least two reasons why that's not always the
1116                  best choice:
1117
1118                  - An IV that's Pmode or wider is more likely to be reusable
1119                    in address calculations than an IV that's narrower than
1120                    Pmode.
1121
1122                  - Doing the comparison in IV_PRECISION or wider allows
1123                    a natural 0-based IV, whereas using a narrower comparison
1124                    type requires mitigations against wrap-around.
1125
1126                  Conversely, if the IV limit is variable, doing the comparison
1127                  in a wider type than the original type can introduce
1128                  unnecessary extensions, so picking the widest valid mode
1129                  is not always a good choice either.
1130
1131                  Here we prefer the first IV type that's Pmode or wider,
1132                  and the first comparison type that's IV_PRECISION or wider.
1133                  (The comparison type must be no wider than the IV type,
1134                  to avoid extensions in the vector loop.)
1135
1136                  ??? We might want to try continuing beyond Pmode for ILP32
1137                  targets if CMP_BITS < IV_PRECISION.  */
1138               iv_type = this_type;
1139               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1140                 cmp_type = this_type;
1141               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1142                 break;
1143             }
1144         }
1145     }
1146
1147   if (!cmp_type)
1148     return false;
1149
1150   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1151   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1152   return true;
1153 }
1154
1155 /* Check whether we can use vector access with length based on precison
1156    comparison.  So far, to keep it simple, we only allow the case that the
1157    precision of the target supported length is larger than the precision
1158    required by loop niters.  */
1159
1160 static bool
1161 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1162 {
1163   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1164     return false;
1165
1166   unsigned int max_nitems_per_iter = 1;
1167   unsigned int i;
1168   rgroup_controls *rgl;
1169   /* Find the maximum number of items per iteration for every rgroup.  */
1170   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1171     {
1172       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1173       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1174     }
1175
1176   /* Work out how many bits we need to represent the length limit.  */
1177   unsigned int min_ni_prec
1178     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1179
1180   /* Now use the maximum of below precisions for one suitable IV type:
1181      - the IV's natural precision
1182      - the precision needed to hold: the maximum number of scalar
1183        iterations multiplied by the scale factor (min_ni_prec above)
1184      - the Pmode precision
1185
1186      If min_ni_prec is less than the precision of the current niters,
1187      we perfer to still use the niters type.  Prefer to use Pmode and
1188      wider IV to avoid narrow conversions.  */
1189
1190   unsigned int ni_prec
1191     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1192   min_ni_prec = MAX (min_ni_prec, ni_prec);
1193   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1194
1195   tree iv_type = NULL_TREE;
1196   opt_scalar_int_mode tmode_iter;
1197   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1198     {
1199       scalar_mode tmode = tmode_iter.require ();
1200       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1201
1202       /* ??? Do we really want to construct one IV whose precision exceeds
1203          BITS_PER_WORD?  */
1204       if (tbits > BITS_PER_WORD)
1205         break;
1206
1207       /* Find the first available standard integral type.  */
1208       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1209         {
1210           iv_type = build_nonstandard_integer_type (tbits, true);
1211           break;
1212         }
1213     }
1214
1215   if (!iv_type)
1216     {
1217       if (dump_enabled_p ())
1218         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219                          "can't vectorize with length-based partial vectors"
1220                          " because there is no suitable iv type.\n");
1221       return false;
1222     }
1223
1224   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1225   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1226
1227   return true;
1228 }
1229
1230 /* Calculate the cost of one scalar iteration of the loop.  */
1231 static void
1232 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1233 {
1234   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1235   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1236   int nbbs = loop->num_nodes, factor;
1237   int innerloop_iters, i;
1238
1239   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1240
1241   /* Gather costs for statements in the scalar loop.  */
1242
1243   /* FORNOW.  */
1244   innerloop_iters = 1;
1245   if (loop->inner)
1246     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1247
1248   for (i = 0; i < nbbs; i++)
1249     {
1250       gimple_stmt_iterator si;
1251       basic_block bb = bbs[i];
1252
1253       if (bb->loop_father == loop->inner)
1254         factor = innerloop_iters;
1255       else
1256         factor = 1;
1257
1258       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1259         {
1260           gimple *stmt = gsi_stmt (si);
1261           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1262
1263           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1264             continue;
1265
1266           /* Skip stmts that are not vectorized inside the loop.  */
1267           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1268           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1269               && (!STMT_VINFO_LIVE_P (vstmt_info)
1270                   || !VECTORIZABLE_CYCLE_DEF
1271                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1272             continue;
1273
1274           vect_cost_for_stmt kind;
1275           if (STMT_VINFO_DATA_REF (stmt_info))
1276             {
1277               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1278                kind = scalar_load;
1279              else
1280                kind = scalar_store;
1281             }
1282           else if (vect_nop_conversion_p (stmt_info))
1283             continue;
1284           else
1285             kind = scalar_stmt;
1286
1287           /* We are using vect_prologue here to avoid scaling twice
1288              by the inner loop factor.  */
1289           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1290                             factor, kind, stmt_info, 0, vect_prologue);
1291         }
1292     }
1293
1294   /* Now accumulate cost.  */
1295   vector_costs *target_cost_data = init_cost (loop_vinfo, true);
1296   stmt_info_for_cost *si;
1297   int j;
1298   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1299                     j, si)
1300     (void) add_stmt_cost (target_cost_data, si->count,
1301                           si->kind, si->stmt_info, si->vectype,
1302                           si->misalign, si->where);
1303   unsigned prologue_cost = 0, body_cost = 0, epilogue_cost = 0;
1304   finish_cost (target_cost_data, &prologue_cost, &body_cost,
1305                &epilogue_cost);
1306   delete target_cost_data;
1307   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1308     = prologue_cost + body_cost + epilogue_cost;
1309 }
1310
1311
1312 /* Function vect_analyze_loop_form.
1313
1314    Verify that certain CFG restrictions hold, including:
1315    - the loop has a pre-header
1316    - the loop has a single entry and exit
1317    - the loop exit condition is simple enough
1318    - the number of iterations can be analyzed, i.e, a countable loop.  The
1319      niter could be analyzed under some assumptions.  */
1320
1321 opt_result
1322 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1323 {
1324   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1325
1326   /* Different restrictions apply when we are considering an inner-most loop,
1327      vs. an outer (nested) loop.
1328      (FORNOW. May want to relax some of these restrictions in the future).  */
1329
1330   info->inner_loop_cond = NULL;
1331   if (!loop->inner)
1332     {
1333       /* Inner-most loop.  We currently require that the number of BBs is
1334          exactly 2 (the header and latch).  Vectorizable inner-most loops
1335          look like this:
1336
1337                         (pre-header)
1338                            |
1339                           header <--------+
1340                            | |            |
1341                            | +--> latch --+
1342                            |
1343                         (exit-bb)  */
1344
1345       if (loop->num_nodes != 2)
1346         return opt_result::failure_at (vect_location,
1347                                        "not vectorized:"
1348                                        " control flow in loop.\n");
1349
1350       if (empty_block_p (loop->header))
1351         return opt_result::failure_at (vect_location,
1352                                        "not vectorized: empty loop.\n");
1353     }
1354   else
1355     {
1356       class loop *innerloop = loop->inner;
1357       edge entryedge;
1358
1359       /* Nested loop. We currently require that the loop is doubly-nested,
1360          contains a single inner loop, and the number of BBs is exactly 5.
1361          Vectorizable outer-loops look like this:
1362
1363                         (pre-header)
1364                            |
1365                           header <---+
1366                            |         |
1367                           inner-loop |
1368                            |         |
1369                           tail ------+
1370                            |
1371                         (exit-bb)
1372
1373          The inner-loop has the properties expected of inner-most loops
1374          as described above.  */
1375
1376       if ((loop->inner)->inner || (loop->inner)->next)
1377         return opt_result::failure_at (vect_location,
1378                                        "not vectorized:"
1379                                        " multiple nested loops.\n");
1380
1381       if (loop->num_nodes != 5)
1382         return opt_result::failure_at (vect_location,
1383                                        "not vectorized:"
1384                                        " control flow in loop.\n");
1385
1386       entryedge = loop_preheader_edge (innerloop);
1387       if (entryedge->src != loop->header
1388           || !single_exit (innerloop)
1389           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1390         return opt_result::failure_at (vect_location,
1391                                        "not vectorized:"
1392                                        " unsupported outerloop form.\n");
1393
1394       /* Analyze the inner-loop.  */
1395       vect_loop_form_info inner;
1396       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1397       if (!res)
1398         {
1399           if (dump_enabled_p ())
1400             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1401                              "not vectorized: Bad inner loop.\n");
1402           return res;
1403         }
1404
1405       /* Don't support analyzing niter under assumptions for inner
1406          loop.  */
1407       if (!integer_onep (inner.assumptions))
1408         return opt_result::failure_at (vect_location,
1409                                        "not vectorized: Bad inner loop.\n");
1410
1411       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1412         return opt_result::failure_at (vect_location,
1413                                        "not vectorized: inner-loop count not"
1414                                        " invariant.\n");
1415
1416       if (dump_enabled_p ())
1417         dump_printf_loc (MSG_NOTE, vect_location,
1418                          "Considering outer-loop vectorization.\n");
1419       info->inner_loop_cond = inner.loop_cond;
1420     }
1421
1422   if (!single_exit (loop))
1423     return opt_result::failure_at (vect_location,
1424                                    "not vectorized: multiple exits.\n");
1425   if (EDGE_COUNT (loop->header->preds) != 2)
1426     return opt_result::failure_at (vect_location,
1427                                    "not vectorized:"
1428                                    " too many incoming edges.\n");
1429
1430   /* We assume that the loop exit condition is at the end of the loop. i.e,
1431      that the loop is represented as a do-while (with a proper if-guard
1432      before the loop if needed), where the loop header contains all the
1433      executable statements, and the latch is empty.  */
1434   if (!empty_block_p (loop->latch)
1435       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1436     return opt_result::failure_at (vect_location,
1437                                    "not vectorized: latch block not empty.\n");
1438
1439   /* Make sure the exit is not abnormal.  */
1440   edge e = single_exit (loop);
1441   if (e->flags & EDGE_ABNORMAL)
1442     return opt_result::failure_at (vect_location,
1443                                    "not vectorized:"
1444                                    " abnormal loop exit edge.\n");
1445
1446   info->loop_cond
1447     = vect_get_loop_niters (loop, &info->assumptions,
1448                             &info->number_of_iterations,
1449                             &info->number_of_iterationsm1);
1450   if (!info->loop_cond)
1451     return opt_result::failure_at
1452       (vect_location,
1453        "not vectorized: complicated exit condition.\n");
1454
1455   if (integer_zerop (info->assumptions)
1456       || !info->number_of_iterations
1457       || chrec_contains_undetermined (info->number_of_iterations))
1458     return opt_result::failure_at
1459       (info->loop_cond,
1460        "not vectorized: number of iterations cannot be computed.\n");
1461
1462   if (integer_zerop (info->number_of_iterations))
1463     return opt_result::failure_at
1464       (info->loop_cond,
1465        "not vectorized: number of iterations = 0.\n");
1466
1467   return opt_result::success ();
1468 }
1469
1470 /* Create a loop_vec_info for LOOP with SHARED and the
1471    vect_analyze_loop_form result.  */
1472
1473 loop_vec_info
1474 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1475                         const vect_loop_form_info *info)
1476 {
1477   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1478   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1479   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1480   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1481   if (!integer_onep (info->assumptions))
1482     {
1483       /* We consider to vectorize this loop by versioning it under
1484          some assumptions.  In order to do this, we need to clear
1485          existing information computed by scev and niter analyzer.  */
1486       scev_reset_htab ();
1487       free_numbers_of_iterations_estimates (loop);
1488       /* Also set flag for this loop so that following scev and niter
1489          analysis are done under the assumptions.  */
1490       loop_constraint_set (loop, LOOP_C_FINITE);
1491       /* Also record the assumptions for versioning.  */
1492       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1493     }
1494
1495   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1496     {
1497       if (dump_enabled_p ())
1498         {
1499           dump_printf_loc (MSG_NOTE, vect_location,
1500                            "Symbolic number of iterations is ");
1501           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1502           dump_printf (MSG_NOTE, "\n");
1503         }
1504     }
1505
1506   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1507   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1508   if (info->inner_loop_cond)
1509     {
1510       stmt_vec_info inner_loop_cond_info
1511         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1512       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1513       /* If we have an estimate on the number of iterations of the inner
1514          loop use that to limit the scale for costing, otherwise use
1515          --param vect-inner-loop-cost-factor literally.  */
1516       widest_int nit;
1517       if (estimated_stmt_executions (loop->inner, &nit))
1518         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1519           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1520     }
1521
1522   return loop_vinfo;
1523 }
1524
1525
1526
1527 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1528    statements update the vectorization factor.  */
1529
1530 static void
1531 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1532 {
1533   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1534   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1535   int nbbs = loop->num_nodes;
1536   poly_uint64 vectorization_factor;
1537   int i;
1538
1539   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1540
1541   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1542   gcc_assert (known_ne (vectorization_factor, 0U));
1543
1544   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1545      vectorization factor of the loop is the unrolling factor required by
1546      the SLP instances.  If that unrolling factor is 1, we say, that we
1547      perform pure SLP on loop - cross iteration parallelism is not
1548      exploited.  */
1549   bool only_slp_in_loop = true;
1550   for (i = 0; i < nbbs; i++)
1551     {
1552       basic_block bb = bbs[i];
1553       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1554            gsi_next (&si))
1555         {
1556           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1557           if (!stmt_info)
1558             continue;
1559           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1560                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1561               && !PURE_SLP_STMT (stmt_info))
1562             /* STMT needs both SLP and loop-based vectorization.  */
1563             only_slp_in_loop = false;
1564         }
1565       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1566            gsi_next (&si))
1567         {
1568           if (is_gimple_debug (gsi_stmt (si)))
1569             continue;
1570           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1571           stmt_info = vect_stmt_to_vectorize (stmt_info);
1572           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1573                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1574               && !PURE_SLP_STMT (stmt_info))
1575             /* STMT needs both SLP and loop-based vectorization.  */
1576             only_slp_in_loop = false;
1577         }
1578     }
1579
1580   if (only_slp_in_loop)
1581     {
1582       if (dump_enabled_p ())
1583         dump_printf_loc (MSG_NOTE, vect_location,
1584                          "Loop contains only SLP stmts\n");
1585       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1586     }
1587   else
1588     {
1589       if (dump_enabled_p ())
1590         dump_printf_loc (MSG_NOTE, vect_location,
1591                          "Loop contains SLP and non-SLP stmts\n");
1592       /* Both the vectorization factor and unroll factor have the form
1593          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1594          so they must have a common multiple.  */
1595       vectorization_factor
1596         = force_common_multiple (vectorization_factor,
1597                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1598     }
1599
1600   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1601   if (dump_enabled_p ())
1602     {
1603       dump_printf_loc (MSG_NOTE, vect_location,
1604                        "Updating vectorization factor to ");
1605       dump_dec (MSG_NOTE, vectorization_factor);
1606       dump_printf (MSG_NOTE, ".\n");
1607     }
1608 }
1609
1610 /* Return true if STMT_INFO describes a double reduction phi and if
1611    the other phi in the reduction is also relevant for vectorization.
1612    This rejects cases such as:
1613
1614       outer1:
1615         x_1 = PHI <x_3(outer2), ...>;
1616         ...
1617
1618       inner:
1619         x_2 = ...;
1620         ...
1621
1622       outer2:
1623         x_3 = PHI <x_2(inner)>;
1624
1625    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1626
1627 static bool
1628 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1629 {
1630   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1631     return false;
1632
1633   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1634 }
1635
1636 /* Function vect_analyze_loop_operations.
1637
1638    Scan the loop stmts and make sure they are all vectorizable.  */
1639
1640 static opt_result
1641 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1642 {
1643   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1644   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1645   int nbbs = loop->num_nodes;
1646   int i;
1647   stmt_vec_info stmt_info;
1648   bool need_to_vectorize = false;
1649   bool ok;
1650
1651   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1652
1653   auto_vec<stmt_info_for_cost> cost_vec;
1654
1655   for (i = 0; i < nbbs; i++)
1656     {
1657       basic_block bb = bbs[i];
1658
1659       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1660            gsi_next (&si))
1661         {
1662           gphi *phi = si.phi ();
1663           ok = true;
1664
1665           stmt_info = loop_vinfo->lookup_stmt (phi);
1666           if (dump_enabled_p ())
1667             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1668           if (virtual_operand_p (gimple_phi_result (phi)))
1669             continue;
1670
1671           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1672              (i.e., a phi in the tail of the outer-loop).  */
1673           if (! is_loop_header_bb_p (bb))
1674             {
1675               /* FORNOW: we currently don't support the case that these phis
1676                  are not used in the outerloop (unless it is double reduction,
1677                  i.e., this phi is vect_reduction_def), cause this case
1678                  requires to actually do something here.  */
1679               if (STMT_VINFO_LIVE_P (stmt_info)
1680                   && !vect_active_double_reduction_p (stmt_info))
1681                 return opt_result::failure_at (phi,
1682                                                "Unsupported loop-closed phi"
1683                                                " in outer-loop.\n");
1684
1685               /* If PHI is used in the outer loop, we check that its operand
1686                  is defined in the inner loop.  */
1687               if (STMT_VINFO_RELEVANT_P (stmt_info))
1688                 {
1689                   tree phi_op;
1690
1691                   if (gimple_phi_num_args (phi) != 1)
1692                     return opt_result::failure_at (phi, "unsupported phi");
1693
1694                   phi_op = PHI_ARG_DEF (phi, 0);
1695                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1696                   if (!op_def_info)
1697                     return opt_result::failure_at (phi, "unsupported phi\n");
1698
1699                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1700                       && (STMT_VINFO_RELEVANT (op_def_info)
1701                           != vect_used_in_outer_by_reduction))
1702                     return opt_result::failure_at (phi, "unsupported phi\n");
1703
1704                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1705                        || (STMT_VINFO_DEF_TYPE (stmt_info)
1706                            == vect_double_reduction_def))
1707                       && !vectorizable_lc_phi (loop_vinfo,
1708                                                stmt_info, NULL, NULL))
1709                     return opt_result::failure_at (phi, "unsupported phi\n");
1710                 }
1711
1712               continue;
1713             }
1714
1715           gcc_assert (stmt_info);
1716
1717           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1718                || STMT_VINFO_LIVE_P (stmt_info))
1719               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1720             /* A scalar-dependence cycle that we don't support.  */
1721             return opt_result::failure_at (phi,
1722                                            "not vectorized:"
1723                                            " scalar dependence cycle.\n");
1724
1725           if (STMT_VINFO_RELEVANT_P (stmt_info))
1726             {
1727               need_to_vectorize = true;
1728               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1729                   && ! PURE_SLP_STMT (stmt_info))
1730                 ok = vectorizable_induction (loop_vinfo,
1731                                              stmt_info, NULL, NULL,
1732                                              &cost_vec);
1733               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1734                         || (STMT_VINFO_DEF_TYPE (stmt_info)
1735                             == vect_double_reduction_def)
1736                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1737                        && ! PURE_SLP_STMT (stmt_info))
1738                 ok = vectorizable_reduction (loop_vinfo,
1739                                              stmt_info, NULL, NULL, &cost_vec);
1740             }
1741
1742           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1743           if (ok
1744               && STMT_VINFO_LIVE_P (stmt_info)
1745               && !PURE_SLP_STMT (stmt_info))
1746             ok = vectorizable_live_operation (loop_vinfo,
1747                                               stmt_info, NULL, NULL, NULL,
1748                                               -1, false, &cost_vec);
1749
1750           if (!ok)
1751             return opt_result::failure_at (phi,
1752                                            "not vectorized: relevant phi not "
1753                                            "supported: %G",
1754                                            static_cast <gimple *> (phi));
1755         }
1756
1757       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1758            gsi_next (&si))
1759         {
1760           gimple *stmt = gsi_stmt (si);
1761           if (!gimple_clobber_p (stmt)
1762               && !is_gimple_debug (stmt))
1763             {
1764               opt_result res
1765                 = vect_analyze_stmt (loop_vinfo,
1766                                      loop_vinfo->lookup_stmt (stmt),
1767                                      &need_to_vectorize,
1768                                      NULL, NULL, &cost_vec);
1769               if (!res)
1770                 return res;
1771             }
1772         }
1773     } /* bbs */
1774
1775   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1776
1777   /* All operations in the loop are either irrelevant (deal with loop
1778      control, or dead), or only used outside the loop and can be moved
1779      out of the loop (e.g. invariants, inductions).  The loop can be
1780      optimized away by scalar optimizations.  We're better off not
1781      touching this loop.  */
1782   if (!need_to_vectorize)
1783     {
1784       if (dump_enabled_p ())
1785         dump_printf_loc (MSG_NOTE, vect_location,
1786                          "All the computation can be taken out of the loop.\n");
1787       return opt_result::failure_at
1788         (vect_location,
1789          "not vectorized: redundant loop. no profit to vectorize.\n");
1790     }
1791
1792   return opt_result::success ();
1793 }
1794
1795 /* Return true if we know that the iteration count is smaller than the
1796    vectorization factor.  Return false if it isn't, or if we can't be sure
1797    either way.  */
1798
1799 static bool
1800 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1801 {
1802   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1803
1804   HOST_WIDE_INT max_niter;
1805   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1806     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1807   else
1808     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1809
1810   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1811     return true;
1812
1813   return false;
1814 }
1815
1816 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1817    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1818    definitely no, or -1 if it's worth retrying.  */
1819
1820 static int
1821 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1822 {
1823   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1824   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1825
1826   /* Only loops that can handle partially-populated vectors can have iteration
1827      counts less than the vectorization factor.  */
1828   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1829     {
1830       if (vect_known_niters_smaller_than_vf (loop_vinfo))
1831         {
1832           if (dump_enabled_p ())
1833             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1834                              "not vectorized: iteration count smaller than "
1835                              "vectorization factor.\n");
1836           return 0;
1837         }
1838     }
1839
1840   /* If using the "very cheap" model. reject cases in which we'd keep
1841      a copy of the scalar code (even if we might be able to vectorize it).  */
1842   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1843       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1844           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1845           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1846     {
1847       if (dump_enabled_p ())
1848         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1849                          "some scalar iterations would need to be peeled\n");
1850       return 0;
1851     }
1852
1853   int min_profitable_iters, min_profitable_estimate;
1854   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1855                                       &min_profitable_estimate);
1856
1857   if (min_profitable_iters < 0)
1858     {
1859       if (dump_enabled_p ())
1860         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861                          "not vectorized: vectorization not profitable.\n");
1862       if (dump_enabled_p ())
1863         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1864                          "not vectorized: vector version will never be "
1865                          "profitable.\n");
1866       return -1;
1867     }
1868
1869   int min_scalar_loop_bound = (param_min_vect_loop_bound
1870                                * assumed_vf);
1871
1872   /* Use the cost model only if it is more conservative than user specified
1873      threshold.  */
1874   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1875                                     min_profitable_iters);
1876
1877   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1878
1879   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1880       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1881     {
1882       if (dump_enabled_p ())
1883         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1884                          "not vectorized: vectorization not profitable.\n");
1885       if (dump_enabled_p ())
1886         dump_printf_loc (MSG_NOTE, vect_location,
1887                          "not vectorized: iteration count smaller than user "
1888                          "specified loop bound parameter or minimum profitable "
1889                          "iterations (whichever is more conservative).\n");
1890       return 0;
1891     }
1892
1893   /* The static profitablity threshold min_profitable_estimate includes
1894      the cost of having to check at runtime whether the scalar loop
1895      should be used instead.  If it turns out that we don't need or want
1896      such a check, the threshold we should use for the static estimate
1897      is simply the point at which the vector loop becomes more profitable
1898      than the scalar loop.  */
1899   if (min_profitable_estimate > min_profitable_iters
1900       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1901       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1902       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1903       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1904     {
1905       if (dump_enabled_p ())
1906         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1907                          " choice between the scalar and vector loops\n");
1908       min_profitable_estimate = min_profitable_iters;
1909     }
1910
1911   /* If the vector loop needs multiple iterations to be beneficial then
1912      things are probably too close to call, and the conservative thing
1913      would be to stick with the scalar code.  */
1914   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1915       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1916     {
1917       if (dump_enabled_p ())
1918         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1919                          "one iteration of the vector loop would be"
1920                          " more expensive than the equivalent number of"
1921                          " iterations of the scalar loop\n");
1922       return 0;
1923     }
1924
1925   HOST_WIDE_INT estimated_niter;
1926
1927   /* If we are vectorizing an epilogue then we know the maximum number of
1928      scalar iterations it will cover is at least one lower than the
1929      vectorization factor of the main loop.  */
1930   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1931     estimated_niter
1932       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1933   else
1934     {
1935       estimated_niter = estimated_stmt_executions_int (loop);
1936       if (estimated_niter == -1)
1937         estimated_niter = likely_max_stmt_executions_int (loop);
1938     }
1939   if (estimated_niter != -1
1940       && ((unsigned HOST_WIDE_INT) estimated_niter
1941           < MAX (th, (unsigned) min_profitable_estimate)))
1942     {
1943       if (dump_enabled_p ())
1944         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1945                          "not vectorized: estimated iteration count too "
1946                          "small.\n");
1947       if (dump_enabled_p ())
1948         dump_printf_loc (MSG_NOTE, vect_location,
1949                          "not vectorized: estimated iteration count smaller "
1950                          "than specified loop bound parameter or minimum "
1951                          "profitable iterations (whichever is more "
1952                          "conservative).\n");
1953       return -1;
1954     }
1955
1956   return 1;
1957 }
1958
1959 static opt_result
1960 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1961                            vec<data_reference_p> *datarefs,
1962                            unsigned int *n_stmts)
1963 {
1964   *n_stmts = 0;
1965   for (unsigned i = 0; i < loop->num_nodes; i++)
1966     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1967          !gsi_end_p (gsi); gsi_next (&gsi))
1968       {
1969         gimple *stmt = gsi_stmt (gsi);
1970         if (is_gimple_debug (stmt))
1971           continue;
1972         ++(*n_stmts);
1973         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1974                                                         NULL, 0);
1975         if (!res)
1976           {
1977             if (is_gimple_call (stmt) && loop->safelen)
1978               {
1979                 tree fndecl = gimple_call_fndecl (stmt), op;
1980                 if (fndecl != NULL_TREE)
1981                   {
1982                     cgraph_node *node = cgraph_node::get (fndecl);
1983                     if (node != NULL && node->simd_clones != NULL)
1984                       {
1985                         unsigned int j, n = gimple_call_num_args (stmt);
1986                         for (j = 0; j < n; j++)
1987                           {
1988                             op = gimple_call_arg (stmt, j);
1989                             if (DECL_P (op)
1990                                 || (REFERENCE_CLASS_P (op)
1991                                     && get_base_address (op)))
1992                               break;
1993                           }
1994                         op = gimple_call_lhs (stmt);
1995                         /* Ignore #pragma omp declare simd functions
1996                            if they don't have data references in the
1997                            call stmt itself.  */
1998                         if (j == n
1999                             && !(op
2000                                  && (DECL_P (op)
2001                                      || (REFERENCE_CLASS_P (op)
2002                                          && get_base_address (op)))))
2003                           continue;
2004                       }
2005                   }
2006               }
2007             return res;
2008           }
2009         /* If dependence analysis will give up due to the limit on the
2010            number of datarefs stop here and fail fatally.  */
2011         if (datarefs->length ()
2012             > (unsigned)param_loop_max_datarefs_for_datadeps)
2013           return opt_result::failure_at (stmt, "exceeded param "
2014                                          "loop-max-datarefs-for-datadeps\n");
2015       }
2016   return opt_result::success ();
2017 }
2018
2019 /* Look for SLP-only access groups and turn each individual access into its own
2020    group.  */
2021 static void
2022 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2023 {
2024   unsigned int i;
2025   struct data_reference *dr;
2026
2027   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2028
2029   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2030   FOR_EACH_VEC_ELT (datarefs, i, dr)
2031     {
2032       gcc_assert (DR_REF (dr));
2033       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2034
2035       /* Check if the load is a part of an interleaving chain.  */
2036       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2037         {
2038           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2039           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2040           unsigned int group_size = DR_GROUP_SIZE (first_element);
2041
2042           /* Check if SLP-only groups.  */
2043           if (!STMT_SLP_TYPE (stmt_info)
2044               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2045             {
2046               /* Dissolve the group.  */
2047               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2048
2049               stmt_vec_info vinfo = first_element;
2050               while (vinfo)
2051                 {
2052                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2053                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2054                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2055                   DR_GROUP_SIZE (vinfo) = 1;
2056                   if (STMT_VINFO_STRIDED_P (first_element))
2057                     DR_GROUP_GAP (vinfo) = 0;
2058                   else
2059                     DR_GROUP_GAP (vinfo) = group_size - 1;
2060                   /* Duplicate and adjust alignment info, it needs to
2061                      be present on each group leader, see dr_misalignment.  */
2062                   if (vinfo != first_element)
2063                     {
2064                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2065                       dr_info2->target_alignment = dr_info->target_alignment;
2066                       int misalignment = dr_info->misalignment;
2067                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2068                         {
2069                           HOST_WIDE_INT diff
2070                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2071                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2072                           unsigned HOST_WIDE_INT align_c
2073                             = dr_info->target_alignment.to_constant ();
2074                           misalignment = (misalignment + diff) % align_c;
2075                         }
2076                       dr_info2->misalignment = misalignment;
2077                     }
2078                   vinfo = next;
2079                 }
2080             }
2081         }
2082     }
2083 }
2084
2085 /* Determine if operating on full vectors for LOOP_VINFO might leave
2086    some scalar iterations still to do.  If so, decide how we should
2087    handle those scalar iterations.  The possibilities are:
2088
2089    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2090        In this case:
2091
2092          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2093          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2094          LOOP_VINFO_PEELING_FOR_NITER == false
2095
2096    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2097        to handle the remaining scalar iterations.  In this case:
2098
2099          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2100          LOOP_VINFO_PEELING_FOR_NITER == true
2101
2102        There are two choices:
2103
2104        (2a) Consider vectorizing the epilogue loop at the same VF as the
2105             main loop, but using partial vectors instead of full vectors.
2106             In this case:
2107
2108               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2109
2110        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2111             In this case:
2112
2113               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2114
2115    When FOR_EPILOGUE_P is true, make this determination based on the
2116    assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2117    based on the assumption that LOOP_VINFO is the main loop.  The caller
2118    has made sure that the number of iterations is set appropriately for
2119    this value of FOR_EPILOGUE_P.  */
2120
2121 opt_result
2122 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2123                                             bool for_epilogue_p)
2124 {
2125   /* Determine whether there would be any scalar iterations left over.  */
2126   bool need_peeling_or_partial_vectors_p
2127     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2128
2129   /* Decide whether to vectorize the loop with partial vectors.  */
2130   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2131   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2132   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2133       && need_peeling_or_partial_vectors_p)
2134     {
2135       /* For partial-vector-usage=1, try to push the handling of partial
2136          vectors to the epilogue, with the main loop continuing to operate
2137          on full vectors.
2138
2139          ??? We could then end up failing to use partial vectors if we
2140          decide to peel iterations into a prologue, and if the main loop
2141          then ends up processing fewer than VF iterations.  */
2142       if (param_vect_partial_vector_usage == 1
2143           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2144           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2145         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2146       else
2147         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2148     }
2149
2150   if (dump_enabled_p ())
2151     {
2152       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2153         dump_printf_loc (MSG_NOTE, vect_location,
2154                          "operating on partial vectors%s.\n",
2155                          for_epilogue_p ? " for epilogue loop" : "");
2156       else
2157         dump_printf_loc (MSG_NOTE, vect_location,
2158                          "operating only on full vectors%s.\n",
2159                          for_epilogue_p ? " for epilogue loop" : "");
2160     }
2161
2162   if (for_epilogue_p)
2163     {
2164       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2165       gcc_assert (orig_loop_vinfo);
2166       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2167         gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2168                               LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2169     }
2170
2171   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2172       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2173     {
2174       /* Check that the loop processes at least one full vector.  */
2175       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2176       tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2177       if (known_lt (wi::to_widest (scalar_niters), vf))
2178         return opt_result::failure_at (vect_location,
2179                                        "loop does not have enough iterations"
2180                                        " to support vectorization.\n");
2181
2182       /* If we need to peel an extra epilogue iteration to handle data
2183          accesses with gaps, check that there are enough scalar iterations
2184          available.
2185
2186          The check above is redundant with this one when peeling for gaps,
2187          but the distinction is useful for diagnostics.  */
2188       tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2189       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2190           && known_lt (wi::to_widest (scalar_nitersm1), vf))
2191         return opt_result::failure_at (vect_location,
2192                                        "loop does not have enough iterations"
2193                                        " to support peeling for gaps.\n");
2194     }
2195
2196   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2197     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2198        && need_peeling_or_partial_vectors_p);
2199
2200   return opt_result::success ();
2201 }
2202
2203 /* Function vect_analyze_loop_2.
2204
2205    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2206    for it.  The different analyses will record information in the
2207    loop_vec_info struct.  */
2208 static opt_result
2209 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2210 {
2211   opt_result ok = opt_result::success ();
2212   int res;
2213   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2214   poly_uint64 min_vf = 2;
2215   loop_vec_info orig_loop_vinfo = NULL;
2216
2217   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2218      loop_vec_info of the first vectorized loop.  */
2219   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2220     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2221   else
2222     orig_loop_vinfo = loop_vinfo;
2223   gcc_assert (orig_loop_vinfo);
2224
2225   /* The first group of checks is independent of the vector size.  */
2226   fatal = true;
2227
2228   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2229       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2230     return opt_result::failure_at (vect_location,
2231                                    "not vectorized: simd if(0)\n");
2232
2233   /* Find all data references in the loop (which correspond to vdefs/vuses)
2234      and analyze their evolution in the loop.  */
2235
2236   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2237
2238   /* Gather the data references and count stmts in the loop.  */
2239   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2240     {
2241       opt_result res
2242         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2243                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2244                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2245       if (!res)
2246         {
2247           if (dump_enabled_p ())
2248             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249                              "not vectorized: loop contains function "
2250                              "calls or data references that cannot "
2251                              "be analyzed\n");
2252           return res;
2253         }
2254       loop_vinfo->shared->save_datarefs ();
2255     }
2256   else
2257     loop_vinfo->shared->check_datarefs ();
2258
2259   /* Analyze the data references and also adjust the minimal
2260      vectorization factor according to the loads and stores.  */
2261
2262   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2263   if (!ok)
2264     {
2265       if (dump_enabled_p ())
2266         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2267                          "bad data references.\n");
2268       return ok;
2269     }
2270
2271   /* Classify all cross-iteration scalar data-flow cycles.
2272      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2273   vect_analyze_scalar_cycles (loop_vinfo);
2274
2275   vect_pattern_recog (loop_vinfo);
2276
2277   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2278
2279   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2280      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2281
2282   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2283   if (!ok)
2284     {
2285       if (dump_enabled_p ())
2286         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287                          "bad data access.\n");
2288       return ok;
2289     }
2290
2291   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2292
2293   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2294   if (!ok)
2295     {
2296       if (dump_enabled_p ())
2297         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298                          "unexpected pattern.\n");
2299       return ok;
2300     }
2301
2302   /* While the rest of the analysis below depends on it in some way.  */
2303   fatal = false;
2304
2305   /* Analyze data dependences between the data-refs in the loop
2306      and adjust the maximum vectorization factor according to
2307      the dependences.
2308      FORNOW: fail at the first data dependence that we encounter.  */
2309
2310   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2311   if (!ok)
2312     {
2313       if (dump_enabled_p ())
2314         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2315                          "bad data dependence.\n");
2316       return ok;
2317     }
2318   if (max_vf != MAX_VECTORIZATION_FACTOR
2319       && maybe_lt (max_vf, min_vf))
2320     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2321   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2322
2323   ok = vect_determine_vectorization_factor (loop_vinfo);
2324   if (!ok)
2325     {
2326       if (dump_enabled_p ())
2327         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2328                          "can't determine vectorization factor.\n");
2329       return ok;
2330     }
2331   if (max_vf != MAX_VECTORIZATION_FACTOR
2332       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2333     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2334
2335   /* Compute the scalar iteration cost.  */
2336   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2337
2338   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2339
2340   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2341   ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2342   if (!ok)
2343     return ok;
2344
2345   /* If there are any SLP instances mark them as pure_slp.  */
2346   bool slp = vect_make_slp_decision (loop_vinfo);
2347   if (slp)
2348     {
2349       /* Find stmts that need to be both vectorized and SLPed.  */
2350       vect_detect_hybrid_slp (loop_vinfo);
2351
2352       /* Update the vectorization factor based on the SLP decision.  */
2353       vect_update_vf_for_slp (loop_vinfo);
2354
2355       /* Optimize the SLP graph with the vectorization factor fixed.  */
2356       vect_optimize_slp (loop_vinfo);
2357
2358       /* Gather the loads reachable from the SLP graph entries.  */
2359       vect_gather_slp_loads (loop_vinfo);
2360     }
2361
2362   bool saved_can_use_partial_vectors_p
2363     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2364
2365   /* We don't expect to have to roll back to anything other than an empty
2366      set of rgroups.  */
2367   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2368
2369   /* This is the point where we can re-start analysis with SLP forced off.  */
2370 start_over:
2371
2372   /* Now the vectorization factor is final.  */
2373   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2374   gcc_assert (known_ne (vectorization_factor, 0U));
2375
2376   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2377     {
2378       dump_printf_loc (MSG_NOTE, vect_location,
2379                        "vectorization_factor = ");
2380       dump_dec (MSG_NOTE, vectorization_factor);
2381       dump_printf (MSG_NOTE, ", niters = %wd\n",
2382                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2383     }
2384
2385   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) = init_cost (loop_vinfo, false);
2386
2387   /* Analyze the alignment of the data-refs in the loop.
2388      Fail if a data reference is found that cannot be vectorized.  */
2389
2390   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2391   if (!ok)
2392     {
2393       if (dump_enabled_p ())
2394         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2395                          "bad data alignment.\n");
2396       return ok;
2397     }
2398
2399   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2400      It is important to call pruning after vect_analyze_data_ref_accesses,
2401      since we use grouping information gathered by interleaving analysis.  */
2402   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2403   if (!ok)
2404     return ok;
2405
2406   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2407      vectorization, since we do not want to add extra peeling or
2408      add versioning for alignment.  */
2409   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2410     /* This pass will decide on using loop versioning and/or loop peeling in
2411        order to enhance the alignment of data references in the loop.  */
2412     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2413   if (!ok)
2414     return ok;
2415
2416   if (slp)
2417     {
2418       /* Analyze operations in the SLP instances.  Note this may
2419          remove unsupported SLP instances which makes the above
2420          SLP kind detection invalid.  */
2421       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2422       vect_slp_analyze_operations (loop_vinfo);
2423       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2424         {
2425           ok = opt_result::failure_at (vect_location,
2426                                        "unsupported SLP instances\n");
2427           goto again;
2428         }
2429
2430       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2431       slp_tree load_node, slp_root;
2432       unsigned i, x;
2433       slp_instance instance;
2434       bool can_use_lanes = true;
2435       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2436         {
2437           slp_root = SLP_INSTANCE_TREE (instance);
2438           int group_size = SLP_TREE_LANES (slp_root);
2439           tree vectype = SLP_TREE_VECTYPE (slp_root);
2440           bool loads_permuted = false;
2441           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2442             {
2443               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2444                 continue;
2445               unsigned j;
2446               stmt_vec_info load_info;
2447               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2448                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2449                   {
2450                     loads_permuted = true;
2451                     break;
2452                   }
2453             }
2454
2455           /* If the loads and stores can be handled with load/store-lane
2456              instructions record it and move on to the next instance.  */
2457           if (loads_permuted
2458               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2459               && vect_store_lanes_supported (vectype, group_size, false))
2460             {
2461               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2462                 {
2463                   stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2464                       (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2465                   /* Use SLP for strided accesses (or if we can't
2466                      load-lanes).  */
2467                   if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2468                       || ! vect_load_lanes_supported
2469                             (STMT_VINFO_VECTYPE (stmt_vinfo),
2470                              DR_GROUP_SIZE (stmt_vinfo), false))
2471                     break;
2472                 }
2473
2474               can_use_lanes
2475                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2476
2477               if (can_use_lanes && dump_enabled_p ())
2478                 dump_printf_loc (MSG_NOTE, vect_location,
2479                                  "SLP instance %p can use load/store-lanes\n",
2480                                  instance);
2481             }
2482           else
2483             {
2484               can_use_lanes = false;
2485               break;
2486             }
2487         }
2488
2489       /* If all SLP instances can use load/store-lanes abort SLP and try again
2490          with SLP disabled.  */
2491       if (can_use_lanes)
2492         {
2493           ok = opt_result::failure_at (vect_location,
2494                                        "Built SLP cancelled: can use "
2495                                        "load/store-lanes\n");
2496           if (dump_enabled_p ())
2497             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2498                              "Built SLP cancelled: all SLP instances support "
2499                              "load/store-lanes\n");
2500           goto again;
2501         }
2502     }
2503
2504   /* Dissolve SLP-only groups.  */
2505   vect_dissolve_slp_only_groups (loop_vinfo);
2506
2507   /* Scan all the remaining operations in the loop that are not subject
2508      to SLP and make sure they are vectorizable.  */
2509   ok = vect_analyze_loop_operations (loop_vinfo);
2510   if (!ok)
2511     {
2512       if (dump_enabled_p ())
2513         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2514                          "bad operation or unsupported loop bound.\n");
2515       return ok;
2516     }
2517
2518   /* For now, we don't expect to mix both masking and length approaches for one
2519      loop, disable it if both are recorded.  */
2520   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2521       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2522       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2523     {
2524       if (dump_enabled_p ())
2525         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2526                          "can't vectorize a loop with partial vectors"
2527                          " because we don't expect to mix different"
2528                          " approaches with partial vectors for the"
2529                          " same loop.\n");
2530       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2531     }
2532
2533   /* If we still have the option of using partial vectors,
2534      check whether we can generate the necessary loop controls.  */
2535   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2536       && !vect_verify_full_masking (loop_vinfo)
2537       && !vect_verify_loop_lens (loop_vinfo))
2538     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2539
2540   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2541      to be able to handle fewer than VF scalars, or needs to have a lower VF
2542      than the main loop.  */
2543   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2544       && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2545       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2546                    LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2547     return opt_result::failure_at (vect_location,
2548                                    "Vectorization factor too high for"
2549                                    " epilogue loop.\n");
2550
2551   /* Decide whether this loop_vinfo should use partial vectors or peeling,
2552      assuming that the loop will be used as a main loop.  We will redo
2553      this analysis later if we instead decide to use the loop as an
2554      epilogue loop.  */
2555   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2556   if (!ok)
2557     return ok;
2558
2559   /* Check the costings of the loop make vectorizing worthwhile.  */
2560   res = vect_analyze_loop_costing (loop_vinfo);
2561   if (res < 0)
2562     {
2563       ok = opt_result::failure_at (vect_location,
2564                                    "Loop costings may not be worthwhile.\n");
2565       goto again;
2566     }
2567   if (!res)
2568     return opt_result::failure_at (vect_location,
2569                                    "Loop costings not worthwhile.\n");
2570
2571   /* If an epilogue loop is required make sure we can create one.  */
2572   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2573       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2574     {
2575       if (dump_enabled_p ())
2576         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2577       if (!vect_can_advance_ivs_p (loop_vinfo)
2578           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2579                                            single_exit (LOOP_VINFO_LOOP
2580                                                          (loop_vinfo))))
2581         {
2582           ok = opt_result::failure_at (vect_location,
2583                                        "not vectorized: can't create required "
2584                                        "epilog loop\n");
2585           goto again;
2586         }
2587     }
2588
2589   /* During peeling, we need to check if number of loop iterations is
2590      enough for both peeled prolog loop and vector loop.  This check
2591      can be merged along with threshold check of loop versioning, so
2592      increase threshold for this case if necessary.
2593
2594      If we are analyzing an epilogue we still want to check what its
2595      versioning threshold would be.  If we decide to vectorize the epilogues we
2596      will want to use the lowest versioning threshold of all epilogues and main
2597      loop.  This will enable us to enter a vectorized epilogue even when
2598      versioning the loop.  We can't simply check whether the epilogue requires
2599      versioning though since we may have skipped some versioning checks when
2600      analyzing the epilogue.  For instance, checks for alias versioning will be
2601      skipped when dealing with epilogues as we assume we already checked them
2602      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
2603   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2604     {
2605       poly_uint64 niters_th = 0;
2606       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2607
2608       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2609         {
2610           /* Niters for peeled prolog loop.  */
2611           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2612             {
2613               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2614               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2615               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2616             }
2617           else
2618             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2619         }
2620
2621       /* Niters for at least one iteration of vectorized loop.  */
2622       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2623         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2624       /* One additional iteration because of peeling for gap.  */
2625       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2626         niters_th += 1;
2627
2628       /*  Use the same condition as vect_transform_loop to decide when to use
2629           the cost to determine a versioning threshold.  */
2630       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2631           && ordered_p (th, niters_th))
2632         niters_th = ordered_max (poly_uint64 (th), niters_th);
2633
2634       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2635     }
2636
2637   gcc_assert (known_eq (vectorization_factor,
2638                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2639
2640   /* Ok to vectorize!  */
2641   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2642   return opt_result::success ();
2643
2644 again:
2645   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2646   gcc_assert (!ok);
2647
2648   /* Try again with SLP forced off but if we didn't do any SLP there is
2649      no point in re-trying.  */
2650   if (!slp)
2651     return ok;
2652
2653   /* If there are reduction chains re-trying will fail anyway.  */
2654   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2655     return ok;
2656
2657   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2658      via interleaving or lane instructions.  */
2659   slp_instance instance;
2660   slp_tree node;
2661   unsigned i, j;
2662   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2663     {
2664       stmt_vec_info vinfo;
2665       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2666       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2667         continue;
2668       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2669       unsigned int size = DR_GROUP_SIZE (vinfo);
2670       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2671       if (! vect_store_lanes_supported (vectype, size, false)
2672          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2673          && ! vect_grouped_store_supported (vectype, size))
2674         return opt_result::failure_at (vinfo->stmt,
2675                                        "unsupported grouped store\n");
2676       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2677         {
2678           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2679           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2680           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2681           size = DR_GROUP_SIZE (vinfo);
2682           vectype = STMT_VINFO_VECTYPE (vinfo);
2683           if (! vect_load_lanes_supported (vectype, size, false)
2684               && ! vect_grouped_load_supported (vectype, single_element_p,
2685                                                 size))
2686             return opt_result::failure_at (vinfo->stmt,
2687                                            "unsupported grouped load\n");
2688         }
2689     }
2690
2691   if (dump_enabled_p ())
2692     dump_printf_loc (MSG_NOTE, vect_location,
2693                      "re-trying with SLP disabled\n");
2694
2695   /* Roll back state appropriately.  No SLP this time.  */
2696   slp = false;
2697   /* Restore vectorization factor as it were without SLP.  */
2698   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2699   /* Free the SLP instances.  */
2700   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2701     vect_free_slp_instance (instance);
2702   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2703   /* Reset SLP type to loop_vect on all stmts.  */
2704   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2705     {
2706       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2707       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2708            !gsi_end_p (si); gsi_next (&si))
2709         {
2710           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2711           STMT_SLP_TYPE (stmt_info) = loop_vect;
2712           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2713               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2714             {
2715               /* vectorizable_reduction adjusts reduction stmt def-types,
2716                  restore them to that of the PHI.  */
2717               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2718                 = STMT_VINFO_DEF_TYPE (stmt_info);
2719               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2720                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
2721                 = STMT_VINFO_DEF_TYPE (stmt_info);
2722             }
2723         }
2724       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2725            !gsi_end_p (si); gsi_next (&si))
2726         {
2727           if (is_gimple_debug (gsi_stmt (si)))
2728             continue;
2729           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2730           STMT_SLP_TYPE (stmt_info) = loop_vect;
2731           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2732             {
2733               stmt_vec_info pattern_stmt_info
2734                 = STMT_VINFO_RELATED_STMT (stmt_info);
2735               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2736                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2737
2738               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2739               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2740               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2741                    !gsi_end_p (pi); gsi_next (&pi))
2742                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2743                   = loop_vect;
2744             }
2745         }
2746     }
2747   /* Free optimized alias test DDRS.  */
2748   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2749   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2750   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2751   /* Reset target cost data.  */
2752   delete LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2753   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) = nullptr;
2754   /* Reset accumulated rgroup information.  */
2755   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2756   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2757   /* Reset assorted flags.  */
2758   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2759   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2760   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2761   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2762   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2763     = saved_can_use_partial_vectors_p;
2764
2765   goto start_over;
2766 }
2767
2768 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2769    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2770    OLD_LOOP_VINFO is better unless something specifically indicates
2771    otherwise.
2772
2773    Note that this deliberately isn't a partial order.  */
2774
2775 static bool
2776 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2777                           loop_vec_info old_loop_vinfo)
2778 {
2779   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2780   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2781
2782   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2783   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2784
2785   /* Always prefer a VF of loop->simdlen over any other VF.  */
2786   if (loop->simdlen)
2787     {
2788       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2789       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2790       if (new_simdlen_p != old_simdlen_p)
2791         return new_simdlen_p;
2792     }
2793
2794   /* Limit the VFs to what is likely to be the maximum number of iterations,
2795      to handle cases in which at least one loop_vinfo is fully-masked.  */
2796   HOST_WIDE_INT estimated_max_niter;
2797   loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo);
2798   unsigned HOST_WIDE_INT main_vf;
2799   if (main_loop
2800       && LOOP_VINFO_NITERS_KNOWN_P (main_loop)
2801       && LOOP_VINFO_VECT_FACTOR (main_loop).is_constant (&main_vf))
2802     estimated_max_niter = LOOP_VINFO_INT_NITERS (main_loop) % main_vf;
2803   else
2804     estimated_max_niter = likely_max_stmt_executions_int (loop);
2805   if (estimated_max_niter != -1)
2806     {
2807       if (known_le (estimated_max_niter, new_vf))
2808         new_vf = estimated_max_niter;
2809       if (known_le (estimated_max_niter, old_vf))
2810         old_vf = estimated_max_niter;
2811     }
2812
2813   /* Check whether the (fractional) cost per scalar iteration is lower
2814      or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf.  */
2815   poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
2816   poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
2817
2818   HOST_WIDE_INT est_rel_new_min
2819     = estimated_poly_value (rel_new, POLY_VALUE_MIN);
2820   HOST_WIDE_INT est_rel_new_max
2821     = estimated_poly_value (rel_new, POLY_VALUE_MAX);
2822
2823   HOST_WIDE_INT est_rel_old_min
2824     = estimated_poly_value (rel_old, POLY_VALUE_MIN);
2825   HOST_WIDE_INT est_rel_old_max
2826     = estimated_poly_value (rel_old, POLY_VALUE_MAX);
2827
2828   /* Check first if we can make out an unambigous total order from the minimum
2829      and maximum estimates.  */
2830   if (est_rel_new_min < est_rel_old_min
2831       && est_rel_new_max < est_rel_old_max)
2832     return true;
2833   else if (est_rel_old_min < est_rel_new_min
2834            && est_rel_old_max < est_rel_new_max)
2835     return false;
2836   /* When old_loop_vinfo uses a variable vectorization factor,
2837      we know that it has a lower cost for at least one runtime VF.
2838      However, we don't know how likely that VF is.
2839
2840      One option would be to compare the costs for the estimated VFs.
2841      The problem is that that can put too much pressure on the cost
2842      model.  E.g. if the estimated VF is also the lowest possible VF,
2843      and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2844      for the estimated VF, we'd then choose new_loop_vinfo even
2845      though (a) new_loop_vinfo might not actually be better than
2846      old_loop_vinfo for that VF and (b) it would be significantly
2847      worse at larger VFs.
2848
2849      Here we go for a hacky compromise: pick new_loop_vinfo if it is
2850      no more expensive than old_loop_vinfo even after doubling the
2851      estimated old_loop_vinfo VF.  For all but trivial loops, this
2852      ensures that we only pick new_loop_vinfo if it is significantly
2853      better than old_loop_vinfo at the estimated VF.  */
2854
2855   if (est_rel_old_min != est_rel_new_min
2856       || est_rel_old_max != est_rel_new_max)
2857     {
2858       HOST_WIDE_INT est_rel_new_likely
2859         = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
2860       HOST_WIDE_INT est_rel_old_likely
2861         = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
2862
2863       return est_rel_new_likely * 2 <= est_rel_old_likely;
2864     }
2865
2866   /* If there's nothing to choose between the loop bodies, see whether
2867      there's a difference in the prologue and epilogue costs.  */
2868   if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2869     return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2870
2871   return false;
2872 }
2873
2874 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2875    true if we should.  */
2876
2877 static bool
2878 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2879                         loop_vec_info old_loop_vinfo)
2880 {
2881   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2882     return false;
2883
2884   if (dump_enabled_p ())
2885     dump_printf_loc (MSG_NOTE, vect_location,
2886                      "***** Preferring vector mode %s to vector mode %s\n",
2887                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
2888                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
2889   return true;
2890 }
2891
2892 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2893    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
2894    MODE_I to the next mode useful to analyze.
2895    Return the loop_vinfo on success and wrapped null on failure.  */
2896
2897 static opt_loop_vec_info
2898 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2899                      const vect_loop_form_info *loop_form_info,
2900                      loop_vec_info main_loop_vinfo,
2901                      const vector_modes &vector_modes, unsigned &mode_i,
2902                      machine_mode &autodetected_vector_mode,
2903                      bool &fatal)
2904 {
2905   loop_vec_info loop_vinfo
2906     = vect_create_loop_vinfo (loop, shared, loop_form_info);
2907   if (main_loop_vinfo)
2908     LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_vinfo;
2909
2910   machine_mode vector_mode = vector_modes[mode_i];
2911   loop_vinfo->vector_mode = vector_mode;
2912
2913   /* Run the main analysis.  */
2914   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal);
2915   if (dump_enabled_p ())
2916     dump_printf_loc (MSG_NOTE, vect_location,
2917                      "***** Analysis %s with vector mode %s\n",
2918                      res ? "succeeded" : " failed",
2919                      GET_MODE_NAME (loop_vinfo->vector_mode));
2920
2921   /* Remember the autodetected vector mode.  */
2922   if (vector_mode == VOIDmode)
2923     autodetected_vector_mode = loop_vinfo->vector_mode;
2924
2925   /* Advance mode_i, first skipping modes that would result in the
2926      same analysis result.  */
2927   while (mode_i + 1 < vector_modes.length ()
2928          && vect_chooses_same_modes_p (loop_vinfo,
2929                                        vector_modes[mode_i + 1]))
2930     {
2931       if (dump_enabled_p ())
2932         dump_printf_loc (MSG_NOTE, vect_location,
2933                          "***** The result for vector mode %s would"
2934                          " be the same\n",
2935                          GET_MODE_NAME (vector_modes[mode_i + 1]));
2936       mode_i += 1;
2937     }
2938   if (mode_i + 1 < vector_modes.length ()
2939       && VECTOR_MODE_P (autodetected_vector_mode)
2940       && (related_vector_mode (vector_modes[mode_i + 1],
2941                                GET_MODE_INNER (autodetected_vector_mode))
2942           == autodetected_vector_mode)
2943       && (related_vector_mode (autodetected_vector_mode,
2944                                GET_MODE_INNER (vector_modes[mode_i + 1]))
2945           == vector_modes[mode_i + 1]))
2946     {
2947       if (dump_enabled_p ())
2948         dump_printf_loc (MSG_NOTE, vect_location,
2949                          "***** Skipping vector mode %s, which would"
2950                          " repeat the analysis for %s\n",
2951                          GET_MODE_NAME (vector_modes[mode_i + 1]),
2952                          GET_MODE_NAME (autodetected_vector_mode));
2953       mode_i += 1;
2954     }
2955   mode_i++;
2956
2957   if (!res)
2958     {
2959       delete loop_vinfo;
2960       if (fatal)
2961         gcc_checking_assert (main_loop_vinfo == NULL);
2962       return opt_loop_vec_info::propagate_failure (res);
2963     }
2964
2965   return opt_loop_vec_info::success (loop_vinfo);
2966 }
2967
2968 /* Function vect_analyze_loop.
2969
2970    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2971    for it.  The different analyses will record information in the
2972    loop_vec_info struct.  */
2973 opt_loop_vec_info
2974 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2975 {
2976   DUMP_VECT_SCOPE ("analyze_loop_nest");
2977
2978   if (loop_outer (loop)
2979       && loop_vec_info_for_loop (loop_outer (loop))
2980       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2981     return opt_loop_vec_info::failure_at (vect_location,
2982                                           "outer-loop already vectorized.\n");
2983
2984   if (!find_loop_nest (loop, &shared->loop_nest))
2985     return opt_loop_vec_info::failure_at
2986       (vect_location,
2987        "not vectorized: loop nest containing two or more consecutive inner"
2988        " loops cannot be vectorized\n");
2989
2990   /* Analyze the loop form.  */
2991   vect_loop_form_info loop_form_info;
2992   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
2993   if (!res)
2994     {
2995       if (dump_enabled_p ())
2996         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2997                          "bad loop form.\n");
2998       return opt_loop_vec_info::propagate_failure (res);
2999     }
3000
3001   /* When pick_lowest_cost_p is true, we should in principle iterate
3002      over all the loop_vec_infos that LOOP_VINFO could replace and
3003      try to vectorize LOOP_VINFO under the same conditions.
3004      E.g. when trying to replace an epilogue loop, we should vectorize
3005      LOOP_VINFO as an epilogue loop with the same VF limit.  When trying
3006      to replace the main loop, we should vectorize LOOP_VINFO as a main
3007      loop too.
3008
3009      However, autovectorize_vector_modes is usually sorted as follows:
3010
3011      - Modes that naturally produce lower VFs usually follow modes that
3012      naturally produce higher VFs.
3013
3014      - When modes naturally produce the same VF, maskable modes
3015      usually follow unmaskable ones, so that the maskable mode
3016      can be used to vectorize the epilogue of the unmaskable mode.
3017
3018      This order is preferred because it leads to the maximum
3019      epilogue vectorization opportunities.  Targets should only use
3020      a different order if they want to make wide modes available while
3021      disparaging them relative to earlier, smaller modes.  The assumption
3022      in that case is that the wider modes are more expensive in some
3023      way that isn't reflected directly in the costs.
3024
3025      There should therefore be few interesting cases in which
3026      LOOP_VINFO fails when treated as an epilogue loop, succeeds when
3027      treated as a standalone loop, and ends up being genuinely cheaper
3028      than FIRST_LOOP_VINFO.  */
3029
3030   auto_vector_modes vector_modes;
3031   /* Autodetect first vector size we try.  */
3032   vector_modes.safe_push (VOIDmode);
3033   unsigned int autovec_flags
3034     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3035                                                     loop->simdlen != 0);
3036   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3037                              && !unlimited_cost_model (loop));
3038   machine_mode autodetected_vector_mode = VOIDmode;
3039   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3040   unsigned int mode_i = 0;
3041   unsigned int first_loop_i = 0;
3042   unsigned int first_loop_next_i = 0;
3043   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3044
3045   /* First determine the main loop vectorization mode.  */
3046   while (1)
3047     {
3048       unsigned int loop_vinfo_i = mode_i;
3049       bool fatal;
3050       opt_loop_vec_info loop_vinfo
3051         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3052                                NULL, vector_modes, mode_i,
3053                                autodetected_vector_mode, fatal);
3054       if (fatal)
3055         break;
3056
3057       if (loop_vinfo)
3058         {
3059           /* Once we hit the desired simdlen for the first time,
3060              discard any previous attempts.  */
3061           if (simdlen
3062               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3063             {
3064               delete first_loop_vinfo;
3065               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3066               simdlen = 0;
3067             }
3068           else if (pick_lowest_cost_p && first_loop_vinfo)
3069             {
3070               /* Keep trying to roll back vectorization attempts while the
3071                  loop_vec_infos they produced were worse than this one.  */
3072               if (vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3073                 {
3074                   delete first_loop_vinfo;
3075                   first_loop_vinfo = opt_loop_vec_info::success (NULL);
3076                 }
3077             }
3078           if (first_loop_vinfo == NULL)
3079             {
3080               first_loop_vinfo = loop_vinfo;
3081               first_loop_i = loop_vinfo_i;
3082               first_loop_next_i = mode_i;
3083             }
3084           else
3085             {
3086               delete loop_vinfo;
3087               loop_vinfo = opt_loop_vec_info::success (NULL);
3088             }
3089
3090           /* Commit to first_loop_vinfo if we have no reason to try
3091              alternatives.  */
3092           if (!simdlen && !pick_lowest_cost_p)
3093             break;
3094         }
3095       if (mode_i == vector_modes.length ()
3096           || autodetected_vector_mode == VOIDmode)
3097         break;
3098
3099       /* Try the next biggest vector size.  */
3100       if (dump_enabled_p ())
3101         dump_printf_loc (MSG_NOTE, vect_location,
3102                          "***** Re-trying analysis with vector mode %s\n",
3103                          GET_MODE_NAME (vector_modes[mode_i]));
3104     }
3105   if (!first_loop_vinfo)
3106     return opt_loop_vec_info::propagate_failure (res);
3107
3108   if (dump_enabled_p ())
3109     dump_printf_loc (MSG_NOTE, vect_location,
3110                      "***** Choosing vector mode %s\n",
3111                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3112
3113   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3114      enabled, SIMDUID is not set, it is the innermost loop and we have
3115      either already found the loop's SIMDLEN or there was no SIMDLEN to
3116      begin with.
3117      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3118   bool vect_epilogues = (!simdlen
3119                          && loop->inner == NULL
3120                          && param_vect_epilogues_nomask
3121                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3122                          && !loop->simduid);
3123   if (!vect_epilogues)
3124     return first_loop_vinfo;
3125
3126   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3127   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3128
3129   /* Handle the case that the original loop can use partial
3130      vectorization, but want to only adopt it for the epilogue.
3131      The retry should be in the same mode as original.  */
3132   if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
3133     {
3134       gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo)
3135                   && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (first_loop_vinfo));
3136       if (dump_enabled_p ())
3137         dump_printf_loc (MSG_NOTE, vect_location,
3138                          "***** Re-trying analysis with same vector mode"
3139                          " %s for epilogue with partial vectors.\n",
3140                          GET_MODE_NAME (first_loop_vinfo->vector_mode));
3141       mode_i = first_loop_i;
3142     }
3143   else
3144     {
3145       mode_i = first_loop_next_i;
3146       if (mode_i == vector_modes.length ())
3147         return first_loop_vinfo;
3148     }
3149
3150   /* ???  If first_loop_vinfo was using VOIDmode then we probably
3151      want to instead search for the corresponding mode in vector_modes[].  */
3152
3153   while (1)
3154     {
3155       bool fatal;
3156       opt_loop_vec_info loop_vinfo
3157         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3158                                first_loop_vinfo,
3159                                vector_modes, mode_i,
3160                                autodetected_vector_mode, fatal);
3161       if (fatal)
3162         break;
3163
3164       if (loop_vinfo)
3165         {
3166           if (pick_lowest_cost_p)
3167             {
3168               /* Keep trying to roll back vectorization attempts while the
3169                  loop_vec_infos they produced were worse than this one.  */
3170               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3171               while (!vinfos.is_empty ()
3172                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3173                 {
3174                   gcc_assert (vect_epilogues);
3175                   delete vinfos.pop ();
3176                 }
3177             }
3178           /* For now only allow one epilogue loop.  */
3179           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3180             {
3181               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3182               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3183               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3184                           || maybe_ne (lowest_th, 0U));
3185               /* Keep track of the known smallest versioning
3186                  threshold.  */
3187               if (ordered_p (lowest_th, th))
3188                 lowest_th = ordered_min (lowest_th, th);
3189             }
3190           else
3191             {
3192               delete loop_vinfo;
3193               loop_vinfo = opt_loop_vec_info::success (NULL);
3194             }
3195
3196           /* For now only allow one epilogue loop, but allow
3197              pick_lowest_cost_p to replace it, so commit to the
3198              first epilogue if we have no reason to try alternatives.  */
3199           if (!pick_lowest_cost_p)
3200             break;
3201         }
3202
3203       if (mode_i == vector_modes.length ())
3204         break;
3205
3206       /* Try the next biggest vector size.  */
3207       if (dump_enabled_p ())
3208         dump_printf_loc (MSG_NOTE, vect_location,
3209                          "***** Re-trying epilogue analysis with vector "
3210                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3211     }
3212
3213   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3214     {
3215       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3216       if (dump_enabled_p ())
3217         dump_printf_loc (MSG_NOTE, vect_location,
3218                          "***** Choosing epilogue vector mode %s\n",
3219                          GET_MODE_NAME
3220                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3221     }
3222
3223   return first_loop_vinfo;
3224 }
3225
3226 /* Return true if there is an in-order reduction function for CODE, storing
3227    it in *REDUC_FN if so.  */
3228
3229 static bool
3230 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3231 {
3232   switch (code)
3233     {
3234     case PLUS_EXPR:
3235       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3236       return true;
3237
3238     default:
3239       return false;
3240     }
3241 }
3242
3243 /* Function reduction_fn_for_scalar_code
3244
3245    Input:
3246    CODE - tree_code of a reduction operations.
3247
3248    Output:
3249    REDUC_FN - the corresponding internal function to be used to reduce the
3250       vector of partial results into a single scalar result, or IFN_LAST
3251       if the operation is a supported reduction operation, but does not have
3252       such an internal function.
3253
3254    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3255
3256 bool
3257 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3258 {
3259   switch (code)
3260     {
3261       case MAX_EXPR:
3262         *reduc_fn = IFN_REDUC_MAX;
3263         return true;
3264
3265       case MIN_EXPR:
3266         *reduc_fn = IFN_REDUC_MIN;
3267         return true;
3268
3269       case PLUS_EXPR:
3270         *reduc_fn = IFN_REDUC_PLUS;
3271         return true;
3272
3273       case BIT_AND_EXPR:
3274         *reduc_fn = IFN_REDUC_AND;
3275         return true;
3276
3277       case BIT_IOR_EXPR:
3278         *reduc_fn = IFN_REDUC_IOR;
3279         return true;
3280
3281       case BIT_XOR_EXPR:
3282         *reduc_fn = IFN_REDUC_XOR;
3283         return true;
3284
3285       case MULT_EXPR:
3286       case MINUS_EXPR:
3287         *reduc_fn = IFN_LAST;
3288         return true;
3289
3290       default:
3291        return false;
3292     }
3293 }
3294
3295 /* If there is a neutral value X such that a reduction would not be affected
3296    by the introduction of additional X elements, return that X, otherwise
3297    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3298    of the scalar elements.  If the reduction has just a single initial value
3299    then INITIAL_VALUE is that value, otherwise it is null.  */
3300
3301 static tree
3302 neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
3303 {
3304   switch (code)
3305     {
3306     case WIDEN_SUM_EXPR:
3307     case DOT_PROD_EXPR:
3308     case SAD_EXPR:
3309     case PLUS_EXPR:
3310     case MINUS_EXPR:
3311     case BIT_IOR_EXPR:
3312     case BIT_XOR_EXPR:
3313       return build_zero_cst (scalar_type);
3314
3315     case MULT_EXPR:
3316       return build_one_cst (scalar_type);
3317
3318     case BIT_AND_EXPR:
3319       return build_all_ones_cst (scalar_type);
3320
3321     case MAX_EXPR:
3322     case MIN_EXPR:
3323       return initial_value;
3324
3325     default:
3326       return NULL_TREE;
3327     }
3328 }
3329
3330 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3331    STMT is printed with a message MSG. */
3332
3333 static void
3334 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3335 {
3336   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3337 }
3338
3339 /* Return true if we need an in-order reduction for operation CODE
3340    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3341    overflow must wrap.  */
3342
3343 bool
3344 needs_fold_left_reduction_p (tree type, tree_code code)
3345 {
3346   /* CHECKME: check for !flag_finite_math_only too?  */
3347   if (SCALAR_FLOAT_TYPE_P (type))
3348     switch (code)
3349       {
3350       case MIN_EXPR:
3351       case MAX_EXPR:
3352         return false;
3353
3354       default:
3355         return !flag_associative_math;
3356       }
3357
3358   if (INTEGRAL_TYPE_P (type))
3359     {
3360       if (!operation_no_trapping_overflow (type, code))
3361         return true;
3362       return false;
3363     }
3364
3365   if (SAT_FIXED_POINT_TYPE_P (type))
3366     return true;
3367
3368   return false;
3369 }
3370
3371 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3372    has a handled computation expression.  Store the main reduction
3373    operation in *CODE.  */
3374
3375 static bool
3376 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3377                       tree loop_arg, enum tree_code *code,
3378                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3379 {
3380   auto_bitmap visited;
3381   tree lookfor = PHI_RESULT (phi);
3382   ssa_op_iter curri;
3383   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3384   while (USE_FROM_PTR (curr) != loop_arg)
3385     curr = op_iter_next_use (&curri);
3386   curri.i = curri.numops;
3387   do
3388     {
3389       path.safe_push (std::make_pair (curri, curr));
3390       tree use = USE_FROM_PTR (curr);
3391       if (use == lookfor)
3392         break;
3393       gimple *def = SSA_NAME_DEF_STMT (use);
3394       if (gimple_nop_p (def)
3395           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3396         {
3397 pop:
3398           do
3399             {
3400               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3401               curri = x.first;
3402               curr = x.second;
3403               do
3404                 curr = op_iter_next_use (&curri);
3405               /* Skip already visited or non-SSA operands (from iterating
3406                  over PHI args).  */
3407               while (curr != NULL_USE_OPERAND_P
3408                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3409                          || ! bitmap_set_bit (visited,
3410                                               SSA_NAME_VERSION
3411                                                 (USE_FROM_PTR (curr)))));
3412             }
3413           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3414           if (curr == NULL_USE_OPERAND_P)
3415             break;
3416         }
3417       else
3418         {
3419           if (gimple_code (def) == GIMPLE_PHI)
3420             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3421           else
3422             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3423           while (curr != NULL_USE_OPERAND_P
3424                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3425                      || ! bitmap_set_bit (visited,
3426                                           SSA_NAME_VERSION
3427                                             (USE_FROM_PTR (curr)))))
3428             curr = op_iter_next_use (&curri);
3429           if (curr == NULL_USE_OPERAND_P)
3430             goto pop;
3431         }
3432     }
3433   while (1);
3434   if (dump_file && (dump_flags & TDF_DETAILS))
3435     {
3436       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3437       unsigned i;
3438       std::pair<ssa_op_iter, use_operand_p> *x;
3439       FOR_EACH_VEC_ELT (path, i, x)
3440         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3441       dump_printf (MSG_NOTE, "\n");
3442     }
3443
3444   /* Check whether the reduction path detected is valid.  */
3445   bool fail = path.length () == 0;
3446   bool neg = false;
3447   int sign = -1;
3448   *code = ERROR_MARK;
3449   for (unsigned i = 1; i < path.length (); ++i)
3450     {
3451       gimple *use_stmt = USE_STMT (path[i].second);
3452       tree op = USE_FROM_PTR (path[i].second);
3453       if (! is_gimple_assign (use_stmt)
3454           /* The following make sure we can compute the operand index
3455              easily plus it mostly disallows chaining via COND_EXPR condition
3456              operands.  */
3457           || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3458               && (gimple_num_ops (use_stmt) <= 2
3459                   || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3460               && (gimple_num_ops (use_stmt) <= 3
3461                   || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3462         {
3463           fail = true;
3464           break;
3465         }
3466       tree_code use_code = gimple_assign_rhs_code (use_stmt);
3467       if (use_code == MINUS_EXPR)
3468         {
3469           use_code = PLUS_EXPR;
3470           /* Track whether we negate the reduction value each iteration.  */
3471           if (gimple_assign_rhs2 (use_stmt) == op)
3472             neg = ! neg;
3473         }
3474       if (CONVERT_EXPR_CODE_P (use_code)
3475           && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3476                                     TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3477         ;
3478       else if (*code == ERROR_MARK)
3479         {
3480           *code = use_code;
3481           sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3482         }
3483       else if (use_code != *code)
3484         {
3485           fail = true;
3486           break;
3487         }
3488       else if ((use_code == MIN_EXPR
3489                 || use_code == MAX_EXPR)
3490                && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3491         {
3492           fail = true;
3493           break;
3494         }
3495       /* Check there's only a single stmt the op is used on.  For the
3496          not value-changing tail and the last stmt allow out-of-loop uses.
3497          ???  We could relax this and handle arbitrary live stmts by
3498          forcing a scalar epilogue for example.  */
3499       imm_use_iterator imm_iter;
3500       gimple *op_use_stmt;
3501       unsigned cnt = 0;
3502       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3503         if (!is_gimple_debug (op_use_stmt)
3504             && (*code != ERROR_MARK
3505                 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3506           {
3507             /* We want to allow x + x but not x < 1 ? x : 2.  */
3508             if (is_gimple_assign (op_use_stmt)
3509                 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3510               {
3511                 use_operand_p use_p;
3512                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3513                   cnt++;
3514               }
3515             else
3516               cnt++;
3517           }
3518       if (cnt != 1)
3519         {
3520           fail = true;
3521           break;
3522         }
3523     }
3524   return ! fail && ! neg && *code != ERROR_MARK;
3525 }
3526
3527 bool
3528 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3529                       tree loop_arg, enum tree_code code)
3530 {
3531   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3532   enum tree_code code_;
3533   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3534           && code_ == code);
3535 }
3536
3537
3538
3539 /* Function vect_is_simple_reduction
3540
3541    (1) Detect a cross-iteration def-use cycle that represents a simple
3542    reduction computation.  We look for the following pattern:
3543
3544    loop_header:
3545      a1 = phi < a0, a2 >
3546      a3 = ...
3547      a2 = operation (a3, a1)
3548
3549    or
3550
3551    a3 = ...
3552    loop_header:
3553      a1 = phi < a0, a2 >
3554      a2 = operation (a3, a1)
3555
3556    such that:
3557    1. operation is commutative and associative and it is safe to
3558       change the order of the computation
3559    2. no uses for a2 in the loop (a2 is used out of the loop)
3560    3. no uses of a1 in the loop besides the reduction operation
3561    4. no uses of a1 outside the loop.
3562
3563    Conditions 1,4 are tested here.
3564    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3565
3566    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3567    nested cycles.
3568
3569    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3570    reductions:
3571
3572      a1 = phi < a0, a2 >
3573      inner loop (def of a3)
3574      a2 = phi < a3 >
3575
3576    (4) Detect condition expressions, ie:
3577      for (int i = 0; i < N; i++)
3578        if (a[i] < val)
3579         ret_val = a[i];
3580
3581 */
3582
3583 static stmt_vec_info
3584 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3585                           bool *double_reduc, bool *reduc_chain_p)
3586 {
3587   gphi *phi = as_a <gphi *> (phi_info->stmt);
3588   gimple *phi_use_stmt = NULL;
3589   imm_use_iterator imm_iter;
3590   use_operand_p use_p;
3591
3592   *double_reduc = false;
3593   *reduc_chain_p = false;
3594   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3595
3596   tree phi_name = PHI_RESULT (phi);
3597   /* ???  If there are no uses of the PHI result the inner loop reduction
3598      won't be detected as possibly double-reduction by vectorizable_reduction
3599      because that tries to walk the PHI arg from the preheader edge which
3600      can be constant.  See PR60382.  */
3601   if (has_zero_uses (phi_name))
3602     return NULL;
3603   class loop *loop = (gimple_bb (phi))->loop_father;
3604   unsigned nphi_def_loop_uses = 0;
3605   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3606     {
3607       gimple *use_stmt = USE_STMT (use_p);
3608       if (is_gimple_debug (use_stmt))
3609         continue;
3610
3611       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3612         {
3613           if (dump_enabled_p ())
3614             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3615                              "intermediate value used outside loop.\n");
3616
3617           return NULL;
3618         }
3619
3620       nphi_def_loop_uses++;
3621       phi_use_stmt = use_stmt;
3622     }
3623
3624   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3625   if (TREE_CODE (latch_def) != SSA_NAME)
3626     {
3627       if (dump_enabled_p ())
3628         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3629                          "reduction: not ssa_name: %T\n", latch_def);
3630       return NULL;
3631     }
3632
3633   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3634   if (!def_stmt_info
3635       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3636     return NULL;
3637
3638   bool nested_in_vect_loop
3639     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3640   unsigned nlatch_def_loop_uses = 0;
3641   auto_vec<gphi *, 3> lcphis;
3642   bool inner_loop_of_double_reduc = false;
3643   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3644     {
3645       gimple *use_stmt = USE_STMT (use_p);
3646       if (is_gimple_debug (use_stmt))
3647         continue;
3648       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3649         nlatch_def_loop_uses++;
3650       else
3651         {
3652           /* We can have more than one loop-closed PHI.  */
3653           lcphis.safe_push (as_a <gphi *> (use_stmt));
3654           if (nested_in_vect_loop
3655               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3656                   == vect_double_reduction_def))
3657             inner_loop_of_double_reduc = true;
3658         }
3659     }
3660
3661   /* If we are vectorizing an inner reduction we are executing that
3662      in the original order only in case we are not dealing with a
3663      double reduction.  */
3664   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3665     {
3666       if (dump_enabled_p ())
3667         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3668                         "detected nested cycle: ");
3669       return def_stmt_info;
3670     }
3671
3672   /* If this isn't a nested cycle or if the nested cycle reduction value
3673      is used ouside of the inner loop we cannot handle uses of the reduction
3674      value.  */
3675   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3676     {
3677       if (dump_enabled_p ())
3678         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3679                          "reduction used in loop.\n");
3680       return NULL;
3681     }
3682
3683   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3684      defined in the inner loop.  */
3685   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3686     {
3687       tree op1 = PHI_ARG_DEF (def_stmt, 0);
3688       if (gimple_phi_num_args (def_stmt) != 1
3689           || TREE_CODE (op1) != SSA_NAME)
3690         {
3691           if (dump_enabled_p ())
3692             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3693                              "unsupported phi node definition.\n");
3694
3695           return NULL;
3696         }
3697
3698       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3699       if (gimple_bb (def1)
3700           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3701           && loop->inner
3702           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3703           && is_gimple_assign (def1)
3704           && is_a <gphi *> (phi_use_stmt)
3705           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3706         {
3707           if (dump_enabled_p ())
3708             report_vect_op (MSG_NOTE, def_stmt,
3709                             "detected double reduction: ");
3710
3711           *double_reduc = true;
3712           return def_stmt_info;
3713         }
3714
3715       return NULL;
3716     }
3717
3718   /* Look for the expression computing latch_def from then loop PHI result.  */
3719   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3720   enum tree_code code;
3721   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3722                             path))
3723     {
3724       STMT_VINFO_REDUC_CODE (phi_info) = code;
3725       if (code == COND_EXPR && !nested_in_vect_loop)
3726         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3727
3728       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3729          reduction chain for which the additional restriction is that
3730          all operations in the chain are the same.  */
3731       auto_vec<stmt_vec_info, 8> reduc_chain;
3732       unsigned i;
3733       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3734       for (i = path.length () - 1; i >= 1; --i)
3735         {
3736           gimple *stmt = USE_STMT (path[i].second);
3737           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3738           STMT_VINFO_REDUC_IDX (stmt_info)
3739             = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3740           enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3741           bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3742                                      && (i == 1 || i == path.length () - 1));
3743           if ((stmt_code != code && !leading_conversion)
3744               /* We can only handle the final value in epilogue
3745                  generation for reduction chains.  */
3746               || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3747             is_slp_reduc = false;
3748           /* For reduction chains we support a trailing/leading
3749              conversions.  We do not store those in the actual chain.  */
3750           if (leading_conversion)
3751             continue;
3752           reduc_chain.safe_push (stmt_info);
3753         }
3754       if (is_slp_reduc && reduc_chain.length () > 1)
3755         {
3756           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3757             {
3758               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3759               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3760             }
3761           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3762           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3763
3764           /* Save the chain for further analysis in SLP detection.  */
3765           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3766           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3767
3768           *reduc_chain_p = true;
3769           if (dump_enabled_p ())
3770             dump_printf_loc (MSG_NOTE, vect_location,
3771                             "reduction: detected reduction chain\n");
3772         }
3773       else if (dump_enabled_p ())
3774         dump_printf_loc (MSG_NOTE, vect_location,
3775                          "reduction: detected reduction\n");
3776
3777       return def_stmt_info;
3778     }
3779
3780   if (dump_enabled_p ())
3781     dump_printf_loc (MSG_NOTE, vect_location,
3782                      "reduction: unknown pattern\n");
3783
3784   return NULL;
3785 }
3786
3787 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3788    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3789    or -1 if not known.  */
3790
3791 static int
3792 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3793 {
3794   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3795   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3796     {
3797       if (dump_enabled_p ())
3798         dump_printf_loc (MSG_NOTE, vect_location,
3799                          "cost model: epilogue peel iters set to vf/2 "
3800                          "because loop iterations are unknown .\n");
3801       return assumed_vf / 2;
3802     }
3803   else
3804     {
3805       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3806       peel_iters_prologue = MIN (niters, peel_iters_prologue);
3807       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3808       /* If we need to peel for gaps, but no peeling is required, we have to
3809          peel VF iterations.  */
3810       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3811         peel_iters_epilogue = assumed_vf;
3812       return peel_iters_epilogue;
3813     }
3814 }
3815
3816 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3817 int
3818 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3819                              int *peel_iters_epilogue,
3820                              stmt_vector_for_cost *scalar_cost_vec,
3821                              stmt_vector_for_cost *prologue_cost_vec,
3822                              stmt_vector_for_cost *epilogue_cost_vec)
3823 {
3824   int retval = 0;
3825
3826   *peel_iters_epilogue
3827     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3828
3829   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3830     {
3831       /* If peeled iterations are known but number of scalar loop
3832          iterations are unknown, count a taken branch per peeled loop.  */
3833       if (peel_iters_prologue > 0)
3834         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3835                                    NULL, NULL_TREE, 0, vect_prologue);
3836       if (*peel_iters_epilogue > 0)
3837         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3838                                     NULL, NULL_TREE, 0, vect_epilogue);
3839     }
3840
3841   stmt_info_for_cost *si;
3842   int j;
3843   if (peel_iters_prologue)
3844     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3845       retval += record_stmt_cost (prologue_cost_vec,
3846                                   si->count * peel_iters_prologue,
3847                                   si->kind, si->stmt_info, si->misalign,
3848                                   vect_prologue);
3849   if (*peel_iters_epilogue)
3850     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3851       retval += record_stmt_cost (epilogue_cost_vec,
3852                                   si->count * *peel_iters_epilogue,
3853                                   si->kind, si->stmt_info, si->misalign,
3854                                   vect_epilogue);
3855
3856   return retval;
3857 }
3858
3859 /* Function vect_estimate_min_profitable_iters
3860
3861    Return the number of iterations required for the vector version of the
3862    loop to be profitable relative to the cost of the scalar version of the
3863    loop.
3864
3865    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3866    of iterations for vectorization.  -1 value means loop vectorization
3867    is not profitable.  This returned value may be used for dynamic
3868    profitability check.
3869
3870    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3871    for static check against estimated number of iterations.  */
3872
3873 static void
3874 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3875                                     int *ret_min_profitable_niters,
3876                                     int *ret_min_profitable_estimate)
3877 {
3878   int min_profitable_iters;
3879   int min_profitable_estimate;
3880   int peel_iters_prologue;
3881   int peel_iters_epilogue;
3882   unsigned vec_inside_cost = 0;
3883   int vec_outside_cost = 0;
3884   unsigned vec_prologue_cost = 0;
3885   unsigned vec_epilogue_cost = 0;
3886   int scalar_single_iter_cost = 0;
3887   int scalar_outside_cost = 0;
3888   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3889   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3890   vector_costs *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3891
3892   /* Cost model disabled.  */
3893   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3894     {
3895       if (dump_enabled_p ())
3896         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3897       *ret_min_profitable_niters = 0;
3898       *ret_min_profitable_estimate = 0;
3899       return;
3900     }
3901
3902   /* Requires loop versioning tests to handle misalignment.  */
3903   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3904     {
3905       /*  FIXME: Make cost depend on complexity of individual check.  */
3906       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3907       (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3908                             NULL, NULL_TREE, 0, vect_prologue);
3909       if (dump_enabled_p ())
3910         dump_printf (MSG_NOTE,
3911                      "cost model: Adding cost of checks for loop "
3912                      "versioning to treat misalignment.\n");
3913     }
3914
3915   /* Requires loop versioning with alias checks.  */
3916   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3917     {
3918       /*  FIXME: Make cost depend on complexity of individual check.  */
3919       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3920       (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3921                             NULL, NULL_TREE, 0, vect_prologue);
3922       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3923       if (len)
3924         /* Count LEN - 1 ANDs and LEN comparisons.  */
3925         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3926                               scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3927       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3928       if (len)
3929         {
3930           /* Count LEN - 1 ANDs and LEN comparisons.  */
3931           unsigned int nstmts = len * 2 - 1;
3932           /* +1 for each bias that needs adding.  */
3933           for (unsigned int i = 0; i < len; ++i)
3934             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3935               nstmts += 1;
3936           (void) add_stmt_cost (target_cost_data, nstmts,
3937                                 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3938         }
3939       if (dump_enabled_p ())
3940         dump_printf (MSG_NOTE,
3941                      "cost model: Adding cost of checks for loop "
3942                      "versioning aliasing.\n");
3943     }
3944
3945   /* Requires loop versioning with niter checks.  */
3946   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3947     {
3948       /*  FIXME: Make cost depend on complexity of individual check.  */
3949       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
3950                             NULL, NULL_TREE, 0, vect_prologue);
3951       if (dump_enabled_p ())
3952         dump_printf (MSG_NOTE,
3953                      "cost model: Adding cost of checks for loop "
3954                      "versioning niters.\n");
3955     }
3956
3957   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3958     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3959                           NULL, NULL_TREE, 0, vect_prologue);
3960
3961   /* Count statements in scalar loop.  Using this as scalar cost for a single
3962      iteration for now.
3963
3964      TODO: Add outer loop support.
3965
3966      TODO: Consider assigning different costs to different scalar
3967      statements.  */
3968
3969   scalar_single_iter_cost
3970     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3971
3972   /* Add additional cost for the peeled instructions in prologue and epilogue
3973      loop.  (For fully-masked loops there will be no peeling.)
3974
3975      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3976      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3977
3978      TODO: Build an expression that represents peel_iters for prologue and
3979      epilogue to be used in a run-time test.  */
3980
3981   bool prologue_need_br_taken_cost = false;
3982   bool prologue_need_br_not_taken_cost = false;
3983
3984   /* Calculate peel_iters_prologue.  */
3985   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3986     peel_iters_prologue = 0;
3987   else if (npeel < 0)
3988     {
3989       peel_iters_prologue = assumed_vf / 2;
3990       if (dump_enabled_p ())
3991         dump_printf (MSG_NOTE, "cost model: "
3992                      "prologue peel iters set to vf/2.\n");
3993
3994       /* If peeled iterations are unknown, count a taken branch and a not taken
3995          branch per peeled loop.  Even if scalar loop iterations are known,
3996          vector iterations are not known since peeled prologue iterations are
3997          not known.  Hence guards remain the same.  */
3998       prologue_need_br_taken_cost = true;
3999       prologue_need_br_not_taken_cost = true;
4000     }
4001   else
4002     {
4003       peel_iters_prologue = npeel;
4004       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4005         /* If peeled iterations are known but number of scalar loop
4006            iterations are unknown, count a taken branch per peeled loop.  */
4007         prologue_need_br_taken_cost = true;
4008     }
4009
4010   bool epilogue_need_br_taken_cost = false;
4011   bool epilogue_need_br_not_taken_cost = false;
4012
4013   /* Calculate peel_iters_epilogue.  */
4014   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4015     /* We need to peel exactly one iteration for gaps.  */
4016     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4017   else if (npeel < 0)
4018     {
4019       /* If peeling for alignment is unknown, loop bound of main loop
4020          becomes unknown.  */
4021       peel_iters_epilogue = assumed_vf / 2;
4022       if (dump_enabled_p ())
4023         dump_printf (MSG_NOTE, "cost model: "
4024                      "epilogue peel iters set to vf/2 because "
4025                      "peeling for alignment is unknown.\n");
4026
4027       /* See the same reason above in peel_iters_prologue calculation.  */
4028       epilogue_need_br_taken_cost = true;
4029       epilogue_need_br_not_taken_cost = true;
4030     }
4031   else
4032     {
4033       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4034       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4035         /* If peeled iterations are known but number of scalar loop
4036            iterations are unknown, count a taken branch per peeled loop.  */
4037         epilogue_need_br_taken_cost = true;
4038     }
4039
4040   stmt_info_for_cost *si;
4041   int j;
4042   /* Add costs associated with peel_iters_prologue.  */
4043   if (peel_iters_prologue)
4044     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4045       {
4046         (void) add_stmt_cost (target_cost_data,
4047                               si->count * peel_iters_prologue, si->kind,
4048                               si->stmt_info, si->vectype, si->misalign,
4049                               vect_prologue);
4050       }
4051
4052   /* Add costs associated with peel_iters_epilogue.  */
4053   if (peel_iters_epilogue)
4054     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4055       {
4056         (void) add_stmt_cost (target_cost_data,
4057                               si->count * peel_iters_epilogue, si->kind,
4058                               si->stmt_info, si->vectype, si->misalign,
4059                               vect_epilogue);
4060       }
4061
4062   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4063
4064   if (prologue_need_br_taken_cost)
4065     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4066                           NULL, NULL_TREE, 0, vect_prologue);
4067
4068   if (prologue_need_br_not_taken_cost)
4069     (void) add_stmt_cost (target_cost_data, 1,
4070                           cond_branch_not_taken, NULL, NULL_TREE, 0,
4071                           vect_prologue);
4072
4073   if (epilogue_need_br_taken_cost)
4074     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4075                           NULL, NULL_TREE, 0, vect_epilogue);
4076
4077   if (epilogue_need_br_not_taken_cost)
4078     (void) add_stmt_cost (target_cost_data, 1,
4079                           cond_branch_not_taken, NULL, NULL_TREE, 0,
4080                           vect_epilogue);
4081
4082   /* Take care of special costs for rgroup controls of partial vectors.  */
4083   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4084     {
4085       /* Calculate how many masks we need to generate.  */
4086       unsigned int num_masks = 0;
4087       rgroup_controls *rgm;
4088       unsigned int num_vectors_m1;
4089       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4090         if (rgm->type)
4091           num_masks += num_vectors_m1 + 1;
4092       gcc_assert (num_masks > 0);
4093
4094       /* In the worst case, we need to generate each mask in the prologue
4095          and in the loop body.  One of the loop body mask instructions
4096          replaces the comparison in the scalar loop, and since we don't
4097          count the scalar comparison against the scalar body, we shouldn't
4098          count that vector instruction against the vector body either.
4099
4100          Sometimes we can use unpacks instead of generating prologue
4101          masks and sometimes the prologue mask will fold to a constant,
4102          so the actual prologue cost might be smaller.  However, it's
4103          simpler and safer to use the worst-case cost; if this ends up
4104          being the tie-breaker between vectorizing or not, then it's
4105          probably better not to vectorize.  */
4106       (void) add_stmt_cost (target_cost_data, num_masks,
4107                             vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4108       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4109                             vector_stmt, NULL, NULL_TREE, 0, vect_body);
4110     }
4111   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4112     {
4113       /* Referring to the functions vect_set_loop_condition_partial_vectors
4114          and vect_set_loop_controls_directly, we need to generate each
4115          length in the prologue and in the loop body if required. Although
4116          there are some possible optimizations, we consider the worst case
4117          here.  */
4118
4119       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4120       bool need_iterate_p
4121         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4122            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4123
4124       /* Calculate how many statements to be added.  */
4125       unsigned int prologue_stmts = 0;
4126       unsigned int body_stmts = 0;
4127
4128       rgroup_controls *rgc;
4129       unsigned int num_vectors_m1;
4130       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4131         if (rgc->type)
4132           {
4133             /* May need one SHIFT for nitems_total computation.  */
4134             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4135             if (nitems != 1 && !niters_known_p)
4136               prologue_stmts += 1;
4137
4138             /* May need one MAX and one MINUS for wrap around.  */
4139             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4140               prologue_stmts += 2;
4141
4142             /* Need one MAX and one MINUS for each batch limit excepting for
4143                the 1st one.  */
4144             prologue_stmts += num_vectors_m1 * 2;
4145
4146             unsigned int num_vectors = num_vectors_m1 + 1;
4147
4148             /* Need to set up lengths in prologue, only one MIN required
4149                for each since start index is zero.  */
4150             prologue_stmts += num_vectors;
4151
4152             /* Each may need two MINs and one MINUS to update lengths in body
4153                for next iteration.  */
4154             if (need_iterate_p)
4155               body_stmts += 3 * num_vectors;
4156           }
4157
4158       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4159                             scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4160       (void) add_stmt_cost (target_cost_data, body_stmts,
4161                             scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4162     }
4163
4164   /* FORNOW: The scalar outside cost is incremented in one of the
4165      following ways:
4166
4167      1. The vectorizer checks for alignment and aliasing and generates
4168      a condition that allows dynamic vectorization.  A cost model
4169      check is ANDED with the versioning condition.  Hence scalar code
4170      path now has the added cost of the versioning check.
4171
4172        if (cost > th & versioning_check)
4173          jmp to vector code
4174
4175      Hence run-time scalar is incremented by not-taken branch cost.
4176
4177      2. The vectorizer then checks if a prologue is required.  If the
4178      cost model check was not done before during versioning, it has to
4179      be done before the prologue check.
4180
4181        if (cost <= th)
4182          prologue = scalar_iters
4183        if (prologue == 0)
4184          jmp to vector code
4185        else
4186          execute prologue
4187        if (prologue == num_iters)
4188          go to exit
4189
4190      Hence the run-time scalar cost is incremented by a taken branch,
4191      plus a not-taken branch, plus a taken branch cost.
4192
4193      3. The vectorizer then checks if an epilogue is required.  If the
4194      cost model check was not done before during prologue check, it
4195      has to be done with the epilogue check.
4196
4197        if (prologue == 0)
4198          jmp to vector code
4199        else
4200          execute prologue
4201        if (prologue == num_iters)
4202          go to exit
4203        vector code:
4204          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4205            jmp to epilogue
4206
4207      Hence the run-time scalar cost should be incremented by 2 taken
4208      branches.
4209
4210      TODO: The back end may reorder the BBS's differently and reverse
4211      conditions/branch directions.  Change the estimates below to
4212      something more reasonable.  */
4213
4214   /* If the number of iterations is known and we do not do versioning, we can
4215      decide whether to vectorize at compile time.  Hence the scalar version
4216      do not carry cost model guard costs.  */
4217   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4218       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4219     {
4220       /* Cost model check occurs at versioning.  */
4221       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4222         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4223       else
4224         {
4225           /* Cost model check occurs at prologue generation.  */
4226           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4227             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4228               + vect_get_stmt_cost (cond_branch_not_taken);
4229           /* Cost model check occurs at epilogue generation.  */
4230           else
4231             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4232         }
4233     }
4234
4235   /* Complete the target-specific cost calculations.  */
4236   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4237                &vec_inside_cost, &vec_epilogue_cost);
4238
4239   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4240
4241   /* Stash the costs so that we can compare two loop_vec_infos.  */
4242   loop_vinfo->vec_inside_cost = vec_inside_cost;
4243   loop_vinfo->vec_outside_cost = vec_outside_cost;
4244
4245   if (dump_enabled_p ())
4246     {
4247       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4248       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4249                    vec_inside_cost);
4250       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4251                    vec_prologue_cost);
4252       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4253                    vec_epilogue_cost);
4254       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4255                    scalar_single_iter_cost);
4256       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4257                    scalar_outside_cost);
4258       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4259                    vec_outside_cost);
4260       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4261                    peel_iters_prologue);
4262       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4263                    peel_iters_epilogue);
4264     }
4265
4266   /* Calculate number of iterations required to make the vector version
4267      profitable, relative to the loop bodies only.  The following condition
4268      must hold true:
4269      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4270      where
4271      SIC = scalar iteration cost, VIC = vector iteration cost,
4272      VOC = vector outside cost, VF = vectorization factor,
4273      NPEEL = prologue iterations + epilogue iterations,
4274      SOC = scalar outside cost for run time cost model check.  */
4275
4276   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4277                           - vec_inside_cost);
4278   if (saving_per_viter <= 0)
4279     {
4280       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4281         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4282                     "vectorization did not happen for a simd loop");
4283
4284       if (dump_enabled_p ())
4285         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4286                          "cost model: the vector iteration cost = %d "
4287                          "divided by the scalar iteration cost = %d "
4288                          "is greater or equal to the vectorization factor = %d"
4289                          ".\n",
4290                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4291       *ret_min_profitable_niters = -1;
4292       *ret_min_profitable_estimate = -1;
4293       return;
4294     }
4295
4296   /* ??? The "if" arm is written to handle all cases; see below for what
4297      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4298   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4299     {
4300       /* Rewriting the condition above in terms of the number of
4301          vector iterations (vniters) rather than the number of
4302          scalar iterations (niters) gives:
4303
4304          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4305
4306          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4307
4308          For integer N, X and Y when X > 0:
4309
4310          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
4311       int outside_overhead = (vec_outside_cost
4312                               - scalar_single_iter_cost * peel_iters_prologue
4313                               - scalar_single_iter_cost * peel_iters_epilogue
4314                               - scalar_outside_cost);
4315       /* We're only interested in cases that require at least one
4316          vector iteration.  */
4317       int min_vec_niters = 1;
4318       if (outside_overhead > 0)
4319         min_vec_niters = outside_overhead / saving_per_viter + 1;
4320
4321       if (dump_enabled_p ())
4322         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4323                      min_vec_niters);
4324
4325       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4326         {
4327           /* Now that we know the minimum number of vector iterations,
4328              find the minimum niters for which the scalar cost is larger:
4329
4330              SIC * niters > VIC * vniters + VOC - SOC
4331
4332              We know that the minimum niters is no more than
4333              vniters * VF + NPEEL, but it might be (and often is) less
4334              than that if a partial vector iteration is cheaper than the
4335              equivalent scalar code.  */
4336           int threshold = (vec_inside_cost * min_vec_niters
4337                            + vec_outside_cost
4338                            - scalar_outside_cost);
4339           if (threshold <= 0)
4340             min_profitable_iters = 1;
4341           else
4342             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4343         }
4344       else
4345         /* Convert the number of vector iterations into a number of
4346            scalar iterations.  */
4347         min_profitable_iters = (min_vec_niters * assumed_vf
4348                                 + peel_iters_prologue
4349                                 + peel_iters_epilogue);
4350     }
4351   else
4352     {
4353       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4354                               * assumed_vf
4355                               - vec_inside_cost * peel_iters_prologue
4356                               - vec_inside_cost * peel_iters_epilogue);
4357       if (min_profitable_iters <= 0)
4358         min_profitable_iters = 0;
4359       else
4360         {
4361           min_profitable_iters /= saving_per_viter;
4362
4363           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4364               <= (((int) vec_inside_cost * min_profitable_iters)
4365                   + (((int) vec_outside_cost - scalar_outside_cost)
4366                      * assumed_vf)))
4367             min_profitable_iters++;
4368         }
4369     }
4370
4371   if (dump_enabled_p ())
4372     dump_printf (MSG_NOTE,
4373                  "  Calculated minimum iters for profitability: %d\n",
4374                  min_profitable_iters);
4375
4376   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4377       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4378     /* We want the vectorized loop to execute at least once.  */
4379     min_profitable_iters = assumed_vf + peel_iters_prologue;
4380   else if (min_profitable_iters < peel_iters_prologue)
4381     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4382        vectorized loop executes at least once.  */
4383     min_profitable_iters = peel_iters_prologue;
4384
4385   if (dump_enabled_p ())
4386     dump_printf_loc (MSG_NOTE, vect_location,
4387                      "  Runtime profitability threshold = %d\n",
4388                      min_profitable_iters);
4389
4390   *ret_min_profitable_niters = min_profitable_iters;
4391
4392   /* Calculate number of iterations required to make the vector version
4393      profitable, relative to the loop bodies only.
4394
4395      Non-vectorized variant is SIC * niters and it must win over vector
4396      variant on the expected loop trip count.  The following condition must hold true:
4397      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
4398
4399   if (vec_outside_cost <= 0)
4400     min_profitable_estimate = 0;
4401   /* ??? This "else if" arm is written to handle all cases; see below for
4402      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4403   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4404     {
4405       /* This is a repeat of the code above, but with + SOC rather
4406          than - SOC.  */
4407       int outside_overhead = (vec_outside_cost
4408                               - scalar_single_iter_cost * peel_iters_prologue
4409                               - scalar_single_iter_cost * peel_iters_epilogue
4410                               + scalar_outside_cost);
4411       int min_vec_niters = 1;
4412       if (outside_overhead > 0)
4413         min_vec_niters = outside_overhead / saving_per_viter + 1;
4414
4415       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4416         {
4417           int threshold = (vec_inside_cost * min_vec_niters
4418                            + vec_outside_cost
4419                            + scalar_outside_cost);
4420           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4421         }
4422       else
4423         min_profitable_estimate = (min_vec_niters * assumed_vf
4424                                    + peel_iters_prologue
4425                                    + peel_iters_epilogue);
4426     }
4427   else
4428     {
4429       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4430                                  * assumed_vf
4431                                  - vec_inside_cost * peel_iters_prologue
4432                                  - vec_inside_cost * peel_iters_epilogue)
4433                                  / ((scalar_single_iter_cost * assumed_vf)
4434                                    - vec_inside_cost);
4435     }
4436   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4437   if (dump_enabled_p ())
4438     dump_printf_loc (MSG_NOTE, vect_location,
4439                      "  Static estimate profitability threshold = %d\n",
4440                      min_profitable_estimate);
4441
4442   *ret_min_profitable_estimate = min_profitable_estimate;
4443 }
4444
4445 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4446    vector elements (not bits) for a vector with NELT elements.  */
4447 static void
4448 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4449                               vec_perm_builder *sel)
4450 {
4451   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4452      by vec_perm_indices.  */
4453   sel->new_vector (nelt, 1, 3);
4454   for (unsigned int i = 0; i < 3; i++)
4455     sel->quick_push (i + offset);
4456 }
4457
4458 /* Checks whether the target supports whole-vector shifts for vectors of mode
4459    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4460    it supports vec_perm_const with masks for all necessary shift amounts.  */
4461 static bool
4462 have_whole_vector_shift (machine_mode mode)
4463 {
4464   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4465     return true;
4466
4467   /* Variable-length vectors should be handled via the optab.  */
4468   unsigned int nelt;
4469   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4470     return false;
4471
4472   vec_perm_builder sel;
4473   vec_perm_indices indices;
4474   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4475     {
4476       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4477       indices.new_vector (sel, 2, nelt);
4478       if (!can_vec_perm_const_p (mode, indices, false))
4479         return false;
4480     }
4481   return true;
4482 }
4483
4484 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4485    functions. Design better to avoid maintenance issues.  */
4486
4487 /* Function vect_model_reduction_cost.
4488
4489    Models cost for a reduction operation, including the vector ops
4490    generated within the strip-mine loop in some cases, the initial
4491    definition before the loop, and the epilogue code that must be generated.  */
4492
4493 static void
4494 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4495                            stmt_vec_info stmt_info, internal_fn reduc_fn,
4496                            vect_reduction_type reduction_type,
4497                            int ncopies, stmt_vector_for_cost *cost_vec)
4498 {
4499   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4500   enum tree_code code;
4501   optab optab;
4502   tree vectype;
4503   machine_mode mode;
4504   class loop *loop = NULL;
4505
4506   if (loop_vinfo)
4507     loop = LOOP_VINFO_LOOP (loop_vinfo);
4508
4509   /* Condition reductions generate two reductions in the loop.  */
4510   if (reduction_type == COND_REDUCTION)
4511     ncopies *= 2;
4512
4513   vectype = STMT_VINFO_VECTYPE (stmt_info);
4514   mode = TYPE_MODE (vectype);
4515   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4516
4517   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4518
4519   if (reduction_type == EXTRACT_LAST_REDUCTION)
4520     /* No extra instructions are needed in the prologue.  The loop body
4521        operations are costed in vectorizable_condition.  */
4522     inside_cost = 0;
4523   else if (reduction_type == FOLD_LEFT_REDUCTION)
4524     {
4525       /* No extra instructions needed in the prologue.  */
4526       prologue_cost = 0;
4527
4528       if (reduc_fn != IFN_LAST)
4529         /* Count one reduction-like operation per vector.  */
4530         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4531                                         stmt_info, 0, vect_body);
4532       else
4533         {
4534           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4535           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4536           inside_cost = record_stmt_cost (cost_vec, nelements,
4537                                           vec_to_scalar, stmt_info, 0,
4538                                           vect_body);
4539           inside_cost += record_stmt_cost (cost_vec, nelements,
4540                                            scalar_stmt, stmt_info, 0,
4541                                            vect_body);
4542         }
4543     }
4544   else
4545     {
4546       /* Add in cost for initial definition.
4547          For cond reduction we have four vectors: initial index, step,
4548          initial result of the data reduction, initial value of the index
4549          reduction.  */
4550       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4551       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4552                                          scalar_to_vec, stmt_info, 0,
4553                                          vect_prologue);
4554     }
4555
4556   /* Determine cost of epilogue code.
4557
4558      We have a reduction operator that will reduce the vector in one statement.
4559      Also requires scalar extract.  */
4560
4561   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4562     {
4563       if (reduc_fn != IFN_LAST)
4564         {
4565           if (reduction_type == COND_REDUCTION)
4566             {
4567               /* An EQ stmt and an COND_EXPR stmt.  */
4568               epilogue_cost += record_stmt_cost (cost_vec, 2,
4569                                                  vector_stmt, stmt_info, 0,
4570                                                  vect_epilogue);
4571               /* Reduction of the max index and a reduction of the found
4572                  values.  */
4573               epilogue_cost += record_stmt_cost (cost_vec, 2,
4574                                                  vec_to_scalar, stmt_info, 0,
4575                                                  vect_epilogue);
4576               /* A broadcast of the max value.  */
4577               epilogue_cost += record_stmt_cost (cost_vec, 1,
4578                                                  scalar_to_vec, stmt_info, 0,
4579                                                  vect_epilogue);
4580             }
4581           else
4582             {
4583               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4584                                                  stmt_info, 0, vect_epilogue);
4585               epilogue_cost += record_stmt_cost (cost_vec, 1,
4586                                                  vec_to_scalar, stmt_info, 0,
4587                                                  vect_epilogue);
4588             }
4589         }
4590       else if (reduction_type == COND_REDUCTION)
4591         {
4592           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4593           /* Extraction of scalar elements.  */
4594           epilogue_cost += record_stmt_cost (cost_vec,
4595                                              2 * estimated_nunits,
4596                                              vec_to_scalar, stmt_info, 0,
4597                                              vect_epilogue);
4598           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4599           epilogue_cost += record_stmt_cost (cost_vec,
4600                                              2 * estimated_nunits - 3,
4601                                              scalar_stmt, stmt_info, 0,
4602                                              vect_epilogue);
4603         }
4604       else if (reduction_type == EXTRACT_LAST_REDUCTION
4605                || reduction_type == FOLD_LEFT_REDUCTION)
4606         /* No extra instructions need in the epilogue.  */
4607         ;
4608       else
4609         {
4610           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4611           tree bitsize =
4612             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4613           int element_bitsize = tree_to_uhwi (bitsize);
4614           int nelements = vec_size_in_bits / element_bitsize;
4615
4616           if (code == COND_EXPR)
4617             code = MAX_EXPR;
4618
4619           optab = optab_for_tree_code (code, vectype, optab_default);
4620
4621           /* We have a whole vector shift available.  */
4622           if (optab != unknown_optab
4623               && VECTOR_MODE_P (mode)
4624               && optab_handler (optab, mode) != CODE_FOR_nothing
4625               && have_whole_vector_shift (mode))
4626             {
4627               /* Final reduction via vector shifts and the reduction operator.
4628                  Also requires scalar extract.  */
4629               epilogue_cost += record_stmt_cost (cost_vec,
4630                                                  exact_log2 (nelements) * 2,
4631                                                  vector_stmt, stmt_info, 0,
4632                                                  vect_epilogue);
4633               epilogue_cost += record_stmt_cost (cost_vec, 1,
4634                                                  vec_to_scalar, stmt_info, 0,
4635                                                  vect_epilogue);
4636             }
4637           else
4638             /* Use extracts and reduction op for final reduction.  For N
4639                elements, we have N extracts and N-1 reduction ops.  */
4640             epilogue_cost += record_stmt_cost (cost_vec,
4641                                                nelements + nelements - 1,
4642                                                vector_stmt, stmt_info, 0,
4643                                                vect_epilogue);
4644         }
4645     }
4646
4647   if (dump_enabled_p ())
4648     dump_printf (MSG_NOTE,
4649                  "vect_model_reduction_cost: inside_cost = %d, "
4650                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4651                  prologue_cost, epilogue_cost);
4652 }
4653
4654 /* SEQ is a sequence of instructions that initialize the reduction
4655    described by REDUC_INFO.  Emit them in the appropriate place.  */
4656
4657 static void
4658 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4659                                 stmt_vec_info reduc_info, gimple *seq)
4660 {
4661   if (reduc_info->reused_accumulator)
4662     {
4663       /* When reusing an accumulator from the main loop, we only need
4664          initialization instructions if the main loop can be skipped.
4665          In that case, emit the initialization instructions at the end
4666          of the guard block that does the skip.  */
4667       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4668       gcc_assert (skip_edge);
4669       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4670       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4671     }
4672   else
4673     {
4674       /* The normal case: emit the initialization instructions on the
4675          preheader edge.  */
4676       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4677       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4678     }
4679 }
4680
4681 /* Function get_initial_def_for_reduction
4682
4683    Input:
4684    REDUC_INFO - the info_for_reduction
4685    INIT_VAL - the initial value of the reduction variable
4686    NEUTRAL_OP - a value that has no effect on the reduction, as per
4687                 neutral_op_for_reduction
4688
4689    Output:
4690    Return a vector variable, initialized according to the operation that
4691         STMT_VINFO performs. This vector will be used as the initial value
4692         of the vector of partial results.
4693
4694    The value we need is a vector in which element 0 has value INIT_VAL
4695    and every other element has value NEUTRAL_OP.  */
4696
4697 static tree
4698 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4699                                stmt_vec_info reduc_info,
4700                                tree init_val, tree neutral_op)
4701 {
4702   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4703   tree scalar_type = TREE_TYPE (init_val);
4704   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4705   tree init_def;
4706   gimple_seq stmts = NULL;
4707
4708   gcc_assert (vectype);
4709
4710   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4711               || SCALAR_FLOAT_TYPE_P (scalar_type));
4712
4713   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4714               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4715
4716   if (operand_equal_p (init_val, neutral_op))
4717     {
4718       /* If both elements are equal then the vector described above is
4719          just a splat.  */
4720       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4721       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4722     }
4723   else
4724     {
4725       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4726       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4727       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4728         {
4729           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4730              element 0.  */
4731           init_def = gimple_build_vector_from_val (&stmts, vectype,
4732                                                    neutral_op);
4733           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4734                                    vectype, init_def, init_val);
4735         }
4736       else
4737         {
4738           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
4739           tree_vector_builder elts (vectype, 1, 2);
4740           elts.quick_push (init_val);
4741           elts.quick_push (neutral_op);
4742           init_def = gimple_build_vector (&stmts, &elts);
4743         }
4744     }
4745
4746   if (stmts)
4747     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4748   return init_def;
4749 }
4750
4751 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4752    which performs a reduction involving GROUP_SIZE scalar statements.
4753    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
4754    is nonnull, introducing extra elements of that value will not change the
4755    result.  */
4756
4757 static void
4758 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4759                                 stmt_vec_info reduc_info,
4760                                 vec<tree> *vec_oprnds,
4761                                 unsigned int number_of_vectors,
4762                                 unsigned int group_size, tree neutral_op)
4763 {
4764   vec<tree> &initial_values = reduc_info->reduc_initial_values;
4765   unsigned HOST_WIDE_INT nunits;
4766   unsigned j, number_of_places_left_in_vector;
4767   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4768   unsigned int i;
4769
4770   gcc_assert (group_size == initial_values.length () || neutral_op);
4771
4772   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4773      created vectors. It is greater than 1 if unrolling is performed.
4774
4775      For example, we have two scalar operands, s1 and s2 (e.g., group of
4776      strided accesses of size two), while NUNITS is four (i.e., four scalars
4777      of this type can be packed in a vector).  The output vector will contain
4778      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4779      will be 2).
4780
4781      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4782      vectors containing the operands.
4783
4784      For example, NUNITS is four as before, and the group size is 8
4785      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4786      {s5, s6, s7, s8}.  */
4787
4788   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4789     nunits = group_size;
4790
4791   number_of_places_left_in_vector = nunits;
4792   bool constant_p = true;
4793   tree_vector_builder elts (vector_type, nunits, 1);
4794   elts.quick_grow (nunits);
4795   gimple_seq ctor_seq = NULL;
4796   for (j = 0; j < nunits * number_of_vectors; ++j)
4797     {
4798       tree op;
4799       i = j % group_size;
4800
4801       /* Get the def before the loop.  In reduction chain we have only
4802          one initial value.  Else we have as many as PHIs in the group.  */
4803       if (i >= initial_values.length () || (j > i && neutral_op))
4804         op = neutral_op;
4805       else
4806         op = initial_values[i];
4807
4808       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4809       number_of_places_left_in_vector--;
4810       elts[nunits - number_of_places_left_in_vector - 1] = op;
4811       if (!CONSTANT_CLASS_P (op))
4812         constant_p = false;
4813
4814       if (number_of_places_left_in_vector == 0)
4815         {
4816           tree init;
4817           if (constant_p && !neutral_op
4818               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4819               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4820             /* Build the vector directly from ELTS.  */
4821             init = gimple_build_vector (&ctor_seq, &elts);
4822           else if (neutral_op)
4823             {
4824               /* Build a vector of the neutral value and shift the
4825                  other elements into place.  */
4826               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4827                                                    neutral_op);
4828               int k = nunits;
4829               while (k > 0 && elts[k - 1] == neutral_op)
4830                 k -= 1;
4831               while (k > 0)
4832                 {
4833                   k -= 1;
4834                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4835                                        vector_type, init, elts[k]);
4836                 }
4837             }
4838           else
4839             {
4840               /* First time round, duplicate ELTS to fill the
4841                  required number of vectors.  */
4842               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4843                                         elts, number_of_vectors, *vec_oprnds);
4844               break;
4845             }
4846           vec_oprnds->quick_push (init);
4847
4848           number_of_places_left_in_vector = nunits;
4849           elts.new_vector (vector_type, nunits, 1);
4850           elts.quick_grow (nunits);
4851           constant_p = true;
4852         }
4853     }
4854   if (ctor_seq != NULL)
4855     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4856 }
4857
4858 /* For a statement STMT_INFO taking part in a reduction operation return
4859    the stmt_vec_info the meta information is stored on.  */
4860
4861 stmt_vec_info
4862 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4863 {
4864   stmt_info = vect_orig_stmt (stmt_info);
4865   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4866   if (!is_a <gphi *> (stmt_info->stmt)
4867       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4868     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4869   gphi *phi = as_a <gphi *> (stmt_info->stmt);
4870   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4871     {
4872       if (gimple_phi_num_args (phi) == 1)
4873         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4874     }
4875   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4876     {
4877       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4878       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4879         stmt_info = info;
4880     }
4881   return stmt_info;
4882 }
4883
4884 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4885    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
4886    return false.  */
4887
4888 static bool
4889 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4890                                 stmt_vec_info reduc_info)
4891 {
4892   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4893   if (!main_loop_vinfo)
4894     return false;
4895
4896   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4897     return false;
4898
4899   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4900   auto_vec<tree, 16> main_loop_results (num_phis);
4901   auto_vec<tree, 16> initial_values (num_phis);
4902   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4903     {
4904       /* The epilogue loop can be entered either from the main loop or
4905          from an earlier guard block.  */
4906       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4907       for (tree incoming_value : reduc_info->reduc_initial_values)
4908         {
4909           /* Look for:
4910
4911                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4912                                     INITIAL_VALUE(guard block)>.  */
4913           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4914
4915           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4916           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4917
4918           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4919           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4920
4921           main_loop_results.quick_push (from_main_loop);
4922           initial_values.quick_push (from_skip);
4923         }
4924     }
4925   else
4926     /* The main loop dominates the epilogue loop.  */
4927     main_loop_results.splice (reduc_info->reduc_initial_values);
4928
4929   /* See if the main loop has the kind of accumulator we need.  */
4930   vect_reusable_accumulator *accumulator
4931     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4932   if (!accumulator
4933       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4934       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4935                       accumulator->reduc_info->reduc_scalar_results.begin ()))
4936     return false;
4937
4938   /* Handle the case where we can reduce wider vectors to narrower ones.  */
4939   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4940   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4941   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4942                             TYPE_VECTOR_SUBPARTS (vectype)))
4943     return false;
4944
4945   /* Non-SLP reductions might apply an adjustment after the reduction
4946      operation, in order to simplify the initialization of the accumulator.
4947      If the epilogue loop carries on from where the main loop left off,
4948      it should apply the same adjustment to the final reduction result.
4949
4950      If the epilogue loop can also be entered directly (rather than via
4951      the main loop), we need to be able to handle that case in the same way,
4952      with the same adjustment.  (In principle we could add a PHI node
4953      to select the correct adjustment, but in practice that shouldn't be
4954      necessary.)  */
4955   tree main_adjustment
4956     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
4957   if (loop_vinfo->main_loop_edge && main_adjustment)
4958     {
4959       gcc_assert (num_phis == 1);
4960       tree initial_value = initial_values[0];
4961       /* Check that we can use INITIAL_VALUE as the adjustment and
4962          initialize the accumulator with a neutral value instead.  */
4963       if (!operand_equal_p (initial_value, main_adjustment))
4964         return false;
4965       tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4966       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
4967                                                     code, initial_value);
4968     }
4969   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
4970   reduc_info->reduc_initial_values.truncate (0);
4971   reduc_info->reduc_initial_values.splice (initial_values);
4972   reduc_info->reused_accumulator = accumulator;
4973   return true;
4974 }
4975
4976 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
4977    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
4978
4979 static tree
4980 vect_create_partial_epilog (tree vec_def, tree vectype, enum tree_code code,
4981                             gimple_seq *seq)
4982 {
4983   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
4984   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4985   tree stype = TREE_TYPE (vectype);
4986   tree new_temp = vec_def;
4987   while (nunits > nunits1)
4988     {
4989       nunits /= 2;
4990       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
4991                                                            stype, nunits);
4992       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
4993
4994       /* The target has to make sure we support lowpart/highpart
4995          extraction, either via direct vector extract or through
4996          an integer mode punning.  */
4997       tree dst1, dst2;
4998       gimple *epilog_stmt;
4999       if (convert_optab_handler (vec_extract_optab,
5000                                  TYPE_MODE (TREE_TYPE (new_temp)),
5001                                  TYPE_MODE (vectype1))
5002           != CODE_FOR_nothing)
5003         {
5004           /* Extract sub-vectors directly once vec_extract becomes
5005              a conversion optab.  */
5006           dst1 = make_ssa_name (vectype1);
5007           epilog_stmt
5008               = gimple_build_assign (dst1, BIT_FIELD_REF,
5009                                      build3 (BIT_FIELD_REF, vectype1,
5010                                              new_temp, TYPE_SIZE (vectype1),
5011                                              bitsize_int (0)));
5012           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5013           dst2 =  make_ssa_name (vectype1);
5014           epilog_stmt
5015               = gimple_build_assign (dst2, BIT_FIELD_REF,
5016                                      build3 (BIT_FIELD_REF, vectype1,
5017                                              new_temp, TYPE_SIZE (vectype1),
5018                                              bitsize_int (bitsize)));
5019           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5020         }
5021       else
5022         {
5023           /* Extract via punning to appropriately sized integer mode
5024              vector.  */
5025           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5026           tree etype = build_vector_type (eltype, 2);
5027           gcc_assert (convert_optab_handler (vec_extract_optab,
5028                                              TYPE_MODE (etype),
5029                                              TYPE_MODE (eltype))
5030                       != CODE_FOR_nothing);
5031           tree tem = make_ssa_name (etype);
5032           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5033                                              build1 (VIEW_CONVERT_EXPR,
5034                                                      etype, new_temp));
5035           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5036           new_temp = tem;
5037           tem = make_ssa_name (eltype);
5038           epilog_stmt
5039               = gimple_build_assign (tem, BIT_FIELD_REF,
5040                                      build3 (BIT_FIELD_REF, eltype,
5041                                              new_temp, TYPE_SIZE (eltype),
5042                                              bitsize_int (0)));
5043           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5044           dst1 = make_ssa_name (vectype1);
5045           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5046                                              build1 (VIEW_CONVERT_EXPR,
5047                                                      vectype1, tem));
5048           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5049           tem = make_ssa_name (eltype);
5050           epilog_stmt
5051               = gimple_build_assign (tem, BIT_FIELD_REF,
5052                                      build3 (BIT_FIELD_REF, eltype,
5053                                              new_temp, TYPE_SIZE (eltype),
5054                                              bitsize_int (bitsize)));
5055           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5056           dst2 =  make_ssa_name (vectype1);
5057           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5058                                              build1 (VIEW_CONVERT_EXPR,
5059                                                      vectype1, tem));
5060           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5061         }
5062
5063       new_temp = make_ssa_name (vectype1);
5064       epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5065       gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5066     }
5067
5068   return new_temp;
5069 }
5070
5071 /* Function vect_create_epilog_for_reduction
5072
5073    Create code at the loop-epilog to finalize the result of a reduction
5074    computation.
5075
5076    STMT_INFO is the scalar reduction stmt that is being vectorized.
5077    SLP_NODE is an SLP node containing a group of reduction statements. The
5078      first one in this group is STMT_INFO.
5079    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5080    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5081      (counting from 0)
5082
5083    This function:
5084    1. Completes the reduction def-use cycles.
5085    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5086       by calling the function specified by REDUC_FN if available, or by
5087       other means (whole-vector shifts or a scalar loop).
5088       The function also creates a new phi node at the loop exit to preserve
5089       loop-closed form, as illustrated below.
5090
5091      The flow at the entry to this function:
5092
5093         loop:
5094           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5095           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5096           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5097         loop_exit:
5098           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5099           use <s_out0>
5100           use <s_out0>
5101
5102      The above is transformed by this function into:
5103
5104         loop:
5105           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5106           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5107           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5108         loop_exit:
5109           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5110           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5111           v_out2 = reduce <v_out1>
5112           s_out3 = extract_field <v_out2, 0>
5113           s_out4 = adjust_result <s_out3>
5114           use <s_out4>
5115           use <s_out4>
5116 */
5117
5118 static void
5119 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5120                                   stmt_vec_info stmt_info,
5121                                   slp_tree slp_node,
5122                                   slp_instance slp_node_instance)
5123 {
5124   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5125   gcc_assert (reduc_info->is_reduc_info);
5126   /* For double reductions we need to get at the inner loop reduction
5127      stmt which has the meta info attached.  Our stmt_info is that of the
5128      loop-closed PHI of the inner loop which we remember as
5129      def for the reduction PHI generation.  */
5130   bool double_reduc = false;
5131   stmt_vec_info rdef_info = stmt_info;
5132   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5133     {
5134       gcc_assert (!slp_node);
5135       double_reduc = true;
5136       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5137                                             (stmt_info->stmt, 0));
5138       stmt_info = vect_stmt_to_vectorize (stmt_info);
5139     }
5140   gphi *reduc_def_stmt
5141     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5142   enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
5143   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5144   tree vectype;
5145   machine_mode mode;
5146   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5147   basic_block exit_bb;
5148   tree scalar_dest;
5149   tree scalar_type;
5150   gimple *new_phi = NULL, *phi;
5151   gimple_stmt_iterator exit_gsi;
5152   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5153   gimple *epilog_stmt = NULL;
5154   gimple *exit_phi;
5155   tree bitsize;
5156   tree def;
5157   tree orig_name, scalar_result;
5158   imm_use_iterator imm_iter, phi_imm_iter;
5159   use_operand_p use_p, phi_use_p;
5160   gimple *use_stmt;
5161   auto_vec<tree> reduc_inputs;
5162   int j, i;
5163   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5164   unsigned int group_size = 1, k;
5165   auto_vec<gimple *> phis;
5166   /* SLP reduction without reduction chain, e.g.,
5167      # a1 = phi <a2, a0>
5168      # b1 = phi <b2, b0>
5169      a2 = operation (a1)
5170      b2 = operation (b1)  */
5171   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5172   bool direct_slp_reduc;
5173   tree induction_index = NULL_TREE;
5174
5175   if (slp_node)
5176     group_size = SLP_TREE_LANES (slp_node);
5177
5178   if (nested_in_vect_loop_p (loop, stmt_info))
5179     {
5180       outer_loop = loop;
5181       loop = loop->inner;
5182       gcc_assert (!slp_node && double_reduc);
5183     }
5184
5185   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5186   gcc_assert (vectype);
5187   mode = TYPE_MODE (vectype);
5188
5189   tree induc_val = NULL_TREE;
5190   tree adjustment_def = NULL;
5191   if (slp_node)
5192     ;
5193   else
5194     {
5195       /* Optimize: for induction condition reduction, if we can't use zero
5196          for induc_val, use initial_def.  */
5197       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5198         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5199       else if (double_reduc)
5200         ;
5201       else
5202         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5203     }
5204
5205   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5206   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5207   if (slp_reduc)
5208     /* All statements produce live-out values.  */
5209     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5210   else if (slp_node)
5211     /* The last statement in the reduction chain produces the live-out
5212        value.  */
5213     single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5214
5215   unsigned vec_num;
5216   int ncopies;
5217   if (slp_node)
5218     {
5219       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5220       ncopies = 1;
5221     }
5222   else
5223     {
5224       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5225       vec_num = 1;
5226       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5227     }
5228
5229   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5230      which is updated with the current index of the loop for every match of
5231      the original loop's cond_expr (VEC_STMT).  This results in a vector
5232      containing the last time the condition passed for that vector lane.
5233      The first match will be a 1 to allow 0 to be used for non-matching
5234      indexes.  If there are no matches at all then the vector will be all
5235      zeroes.
5236
5237      PR92772: This algorithm is broken for architectures that support
5238      masked vectors, but do not provide fold_extract_last.  */
5239   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5240     {
5241       auto_vec<std::pair<tree, bool>, 2> ccompares;
5242       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5243       cond_info = vect_stmt_to_vectorize (cond_info);
5244       while (cond_info != reduc_info)
5245         {
5246           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5247             {
5248               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5249               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5250               ccompares.safe_push
5251                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5252                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
5253             }
5254           cond_info
5255             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5256                                                  1 + STMT_VINFO_REDUC_IDX
5257                                                         (cond_info)));
5258           cond_info = vect_stmt_to_vectorize (cond_info);
5259         }
5260       gcc_assert (ccompares.length () != 0);
5261
5262       tree indx_before_incr, indx_after_incr;
5263       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5264       int scalar_precision
5265         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5266       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5267       tree cr_index_vector_type = get_related_vectype_for_scalar_type
5268         (TYPE_MODE (vectype), cr_index_scalar_type,
5269          TYPE_VECTOR_SUBPARTS (vectype));
5270
5271       /* First we create a simple vector induction variable which starts
5272          with the values {1,2,3,...} (SERIES_VECT) and increments by the
5273          vector size (STEP).  */
5274
5275       /* Create a {1,2,3,...} vector.  */
5276       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5277
5278       /* Create a vector of the step value.  */
5279       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5280       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5281
5282       /* Create an induction variable.  */
5283       gimple_stmt_iterator incr_gsi;
5284       bool insert_after;
5285       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5286       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5287                  insert_after, &indx_before_incr, &indx_after_incr);
5288
5289       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5290          filled with zeros (VEC_ZERO).  */
5291
5292       /* Create a vector of 0s.  */
5293       tree zero = build_zero_cst (cr_index_scalar_type);
5294       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5295
5296       /* Create a vector phi node.  */
5297       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5298       new_phi = create_phi_node (new_phi_tree, loop->header);
5299       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5300                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
5301
5302       /* Now take the condition from the loops original cond_exprs
5303          and produce a new cond_exprs (INDEX_COND_EXPR) which for
5304          every match uses values from the induction variable
5305          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5306          (NEW_PHI_TREE).
5307          Finally, we update the phi (NEW_PHI_TREE) to take the value of
5308          the new cond_expr (INDEX_COND_EXPR).  */
5309       gimple_seq stmts = NULL;
5310       for (int i = ccompares.length () - 1; i != -1; --i)
5311         {
5312           tree ccompare = ccompares[i].first;
5313           if (ccompares[i].second)
5314             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5315                                          cr_index_vector_type,
5316                                          ccompare,
5317                                          indx_before_incr, new_phi_tree);
5318           else
5319             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5320                                          cr_index_vector_type,
5321                                          ccompare,
5322                                          new_phi_tree, indx_before_incr);
5323         }
5324       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5325
5326       /* Update the phi with the vec cond.  */
5327       induction_index = new_phi_tree;
5328       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5329                    loop_latch_edge (loop), UNKNOWN_LOCATION);
5330     }
5331
5332   /* 2. Create epilog code.
5333         The reduction epilog code operates across the elements of the vector
5334         of partial results computed by the vectorized loop.
5335         The reduction epilog code consists of:
5336
5337         step 1: compute the scalar result in a vector (v_out2)
5338         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5339         step 3: adjust the scalar result (s_out3) if needed.
5340
5341         Step 1 can be accomplished using one the following three schemes:
5342           (scheme 1) using reduc_fn, if available.
5343           (scheme 2) using whole-vector shifts, if available.
5344           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5345                      combined.
5346
5347           The overall epilog code looks like this:
5348
5349           s_out0 = phi <s_loop>         # original EXIT_PHI
5350           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5351           v_out2 = reduce <v_out1>              # step 1
5352           s_out3 = extract_field <v_out2, 0>    # step 2
5353           s_out4 = adjust_result <s_out3>       # step 3
5354
5355           (step 3 is optional, and steps 1 and 2 may be combined).
5356           Lastly, the uses of s_out0 are replaced by s_out4.  */
5357
5358
5359   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5360          v_out1 = phi <VECT_DEF>
5361          Store them in NEW_PHIS.  */
5362   if (double_reduc)
5363     loop = outer_loop;
5364   exit_bb = single_exit (loop)->dest;
5365   exit_gsi = gsi_after_labels (exit_bb);
5366   reduc_inputs.create (slp_node ? vec_num : ncopies);
5367   for (unsigned i = 0; i < vec_num; i++)
5368     {
5369       gimple_seq stmts = NULL;
5370       if (slp_node)
5371         def = vect_get_slp_vect_def (slp_node, i);
5372       else
5373         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5374       for (j = 0; j < ncopies; j++)
5375         {
5376           tree new_def = copy_ssa_name (def);
5377           phi = create_phi_node (new_def, exit_bb);
5378           if (j)
5379             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5380           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5381           new_def = gimple_convert (&stmts, vectype, new_def);
5382           reduc_inputs.quick_push (new_def);
5383         }
5384       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5385     }
5386
5387   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5388          (i.e. when reduc_fn is not available) and in the final adjustment
5389          code (if needed).  Also get the original scalar reduction variable as
5390          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5391          represents a reduction pattern), the tree-code and scalar-def are
5392          taken from the original stmt that the pattern-stmt (STMT) replaces.
5393          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5394          are taken from STMT.  */
5395
5396   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5397   if (orig_stmt_info != stmt_info)
5398     {
5399       /* Reduction pattern  */
5400       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5401       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5402     }
5403
5404   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5405   scalar_type = TREE_TYPE (scalar_dest);
5406   scalar_results.create (group_size);
5407   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5408   bitsize = TYPE_SIZE (scalar_type);
5409
5410   /* True if we should implement SLP_REDUC using native reduction operations
5411      instead of scalar operations.  */
5412   direct_slp_reduc = (reduc_fn != IFN_LAST
5413                       && slp_reduc
5414                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5415
5416   /* In case of reduction chain, e.g.,
5417      # a1 = phi <a3, a0>
5418      a2 = operation (a1)
5419      a3 = operation (a2),
5420
5421      we may end up with more than one vector result.  Here we reduce them
5422      to one vector.
5423
5424      The same is true if we couldn't use a single defuse cycle.  */
5425   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5426       || direct_slp_reduc
5427       || ncopies > 1)
5428     {
5429       gimple_seq stmts = NULL;
5430       tree single_input = reduc_inputs[0];
5431       for (k = 1; k < reduc_inputs.length (); k++)
5432         single_input = gimple_build (&stmts, code, vectype,
5433                                      single_input, reduc_inputs[k]);
5434       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5435
5436       reduc_inputs.truncate (0);
5437       reduc_inputs.safe_push (single_input);
5438     }
5439
5440   tree orig_reduc_input = reduc_inputs[0];
5441
5442   /* If this loop is an epilogue loop that can be skipped after the
5443      main loop, we can only share a reduction operation between the
5444      main loop and the epilogue if we put it at the target of the
5445      skip edge.
5446
5447      We can still reuse accumulators if this check fails.  Doing so has
5448      the minor(?) benefit of making the epilogue loop's scalar result
5449      independent of the main loop's scalar result.  */
5450   bool unify_with_main_loop_p = false;
5451   if (reduc_info->reused_accumulator
5452       && loop_vinfo->skip_this_loop_edge
5453       && single_succ_p (exit_bb)
5454       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5455     {
5456       unify_with_main_loop_p = true;
5457
5458       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5459       reduc_inputs[0] = make_ssa_name (vectype);
5460       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5461       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5462                    UNKNOWN_LOCATION);
5463       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5464                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5465       exit_gsi = gsi_after_labels (reduc_block);
5466     }
5467
5468   /* Shouldn't be used beyond this point.  */
5469   exit_bb = nullptr;
5470
5471   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5472       && reduc_fn != IFN_LAST)
5473     {
5474       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5475          various data values where the condition matched and another vector
5476          (INDUCTION_INDEX) containing all the indexes of those matches.  We
5477          need to extract the last matching index (which will be the index with
5478          highest value) and use this to index into the data vector.
5479          For the case where there were no matches, the data vector will contain
5480          all default values and the index vector will be all zeros.  */
5481
5482       /* Get various versions of the type of the vector of indexes.  */
5483       tree index_vec_type = TREE_TYPE (induction_index);
5484       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5485       tree index_scalar_type = TREE_TYPE (index_vec_type);
5486       tree index_vec_cmp_type = truth_type_for (index_vec_type);
5487
5488       /* Get an unsigned integer version of the type of the data vector.  */
5489       int scalar_precision
5490         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5491       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5492       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5493                                                 vectype);
5494
5495       /* First we need to create a vector (ZERO_VEC) of zeros and another
5496          vector (MAX_INDEX_VEC) filled with the last matching index, which we
5497          can create using a MAX reduction and then expanding.
5498          In the case where the loop never made any matches, the max index will
5499          be zero.  */
5500
5501       /* Vector of {0, 0, 0,...}.  */
5502       tree zero_vec = build_zero_cst (vectype);
5503
5504       /* Find maximum value from the vector of found indexes.  */
5505       tree max_index = make_ssa_name (index_scalar_type);
5506       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5507                                                           1, induction_index);
5508       gimple_call_set_lhs (max_index_stmt, max_index);
5509       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5510
5511       /* Vector of {max_index, max_index, max_index,...}.  */
5512       tree max_index_vec = make_ssa_name (index_vec_type);
5513       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5514                                                       max_index);
5515       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5516                                                         max_index_vec_rhs);
5517       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5518
5519       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5520          with the vector (INDUCTION_INDEX) of found indexes, choosing values
5521          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5522          otherwise.  Only one value should match, resulting in a vector
5523          (VEC_COND) with one data value and the rest zeros.
5524          In the case where the loop never made any matches, every index will
5525          match, resulting in a vector with all data values (which will all be
5526          the default value).  */
5527
5528       /* Compare the max index vector to the vector of found indexes to find
5529          the position of the max value.  */
5530       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5531       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5532                                                       induction_index,
5533                                                       max_index_vec);
5534       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5535
5536       /* Use the compare to choose either values from the data vector or
5537          zero.  */
5538       tree vec_cond = make_ssa_name (vectype);
5539       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5540                                                    vec_compare,
5541                                                    reduc_inputs[0],
5542                                                    zero_vec);
5543       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5544
5545       /* Finally we need to extract the data value from the vector (VEC_COND)
5546          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5547          reduction, but because this doesn't exist, we can use a MAX reduction
5548          instead.  The data value might be signed or a float so we need to cast
5549          it first.
5550          In the case where the loop never made any matches, the data values are
5551          all identical, and so will reduce down correctly.  */
5552
5553       /* Make the matched data values unsigned.  */
5554       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5555       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5556                                        vec_cond);
5557       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5558                                                         VIEW_CONVERT_EXPR,
5559                                                         vec_cond_cast_rhs);
5560       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5561
5562       /* Reduce down to a scalar value.  */
5563       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5564       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5565                                                            1, vec_cond_cast);
5566       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5567       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5568
5569       /* Convert the reduced value back to the result type and set as the
5570          result.  */
5571       gimple_seq stmts = NULL;
5572       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5573                                data_reduc);
5574       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5575       scalar_results.safe_push (new_temp);
5576     }
5577   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5578            && reduc_fn == IFN_LAST)
5579     {
5580       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5581          idx = 0;
5582          idx_val = induction_index[0];
5583          val = data_reduc[0];
5584          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5585            if (induction_index[i] > idx_val)
5586              val = data_reduc[i], idx_val = induction_index[i];
5587          return val;  */
5588
5589       tree data_eltype = TREE_TYPE (vectype);
5590       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5591       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5592       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5593       /* Enforced by vectorizable_reduction, which ensures we have target
5594          support before allowing a conditional reduction on variable-length
5595          vectors.  */
5596       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5597       tree idx_val = NULL_TREE, val = NULL_TREE;
5598       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5599         {
5600           tree old_idx_val = idx_val;
5601           tree old_val = val;
5602           idx_val = make_ssa_name (idx_eltype);
5603           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5604                                              build3 (BIT_FIELD_REF, idx_eltype,
5605                                                      induction_index,
5606                                                      bitsize_int (el_size),
5607                                                      bitsize_int (off)));
5608           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5609           val = make_ssa_name (data_eltype);
5610           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5611                                              build3 (BIT_FIELD_REF,
5612                                                      data_eltype,
5613                                                      reduc_inputs[0],
5614                                                      bitsize_int (el_size),
5615                                                      bitsize_int (off)));
5616           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5617           if (off != 0)
5618             {
5619               tree new_idx_val = idx_val;
5620               if (off != v_size - el_size)
5621                 {
5622                   new_idx_val = make_ssa_name (idx_eltype);
5623                   epilog_stmt = gimple_build_assign (new_idx_val,
5624                                                      MAX_EXPR, idx_val,
5625                                                      old_idx_val);
5626                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5627                 }
5628               tree new_val = make_ssa_name (data_eltype);
5629               epilog_stmt = gimple_build_assign (new_val,
5630                                                  COND_EXPR,
5631                                                  build2 (GT_EXPR,
5632                                                          boolean_type_node,
5633                                                          idx_val,
5634                                                          old_idx_val),
5635                                                  val, old_val);
5636               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5637               idx_val = new_idx_val;
5638               val = new_val;
5639             }
5640         }
5641       /* Convert the reduced value back to the result type and set as the
5642          result.  */
5643       gimple_seq stmts = NULL;
5644       val = gimple_convert (&stmts, scalar_type, val);
5645       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5646       scalar_results.safe_push (val);
5647     }
5648
5649   /* 2.3 Create the reduction code, using one of the three schemes described
5650          above. In SLP we simply need to extract all the elements from the
5651          vector (without reducing them), so we use scalar shifts.  */
5652   else if (reduc_fn != IFN_LAST && !slp_reduc)
5653     {
5654       tree tmp;
5655       tree vec_elem_type;
5656
5657       /* Case 1:  Create:
5658          v_out2 = reduc_expr <v_out1>  */
5659
5660       if (dump_enabled_p ())
5661         dump_printf_loc (MSG_NOTE, vect_location,
5662                          "Reduce using direct vector reduction.\n");
5663
5664       gimple_seq stmts = NULL;
5665       vec_elem_type = TREE_TYPE (vectype);
5666       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5667                                vec_elem_type, reduc_inputs[0]);
5668       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5669       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5670
5671       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5672           && induc_val)
5673         {
5674           /* Earlier we set the initial value to be a vector if induc_val
5675              values.  Check the result and if it is induc_val then replace
5676              with the original initial value, unless induc_val is
5677              the same as initial_def already.  */
5678           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5679                                   induc_val);
5680           tree initial_def = reduc_info->reduc_initial_values[0];
5681
5682           tmp = make_ssa_name (new_scalar_dest);
5683           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5684                                              initial_def, new_temp);
5685           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5686           new_temp = tmp;
5687         }
5688
5689       scalar_results.safe_push (new_temp);
5690     }
5691   else if (direct_slp_reduc)
5692     {
5693       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5694          with the elements for other SLP statements replaced with the
5695          neutral value.  We can then do a normal reduction on each vector.  */
5696
5697       /* Enforced by vectorizable_reduction.  */
5698       gcc_assert (reduc_inputs.length () == 1);
5699       gcc_assert (pow2p_hwi (group_size));
5700
5701       gimple_seq seq = NULL;
5702
5703       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5704          and the same element size as VECTYPE.  */
5705       tree index = build_index_vector (vectype, 0, 1);
5706       tree index_type = TREE_TYPE (index);
5707       tree index_elt_type = TREE_TYPE (index_type);
5708       tree mask_type = truth_type_for (index_type);
5709
5710       /* Create a vector that, for each element, identifies which of
5711          the REDUC_GROUP_SIZE results should use it.  */
5712       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5713       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5714                             build_vector_from_val (index_type, index_mask));
5715
5716       /* Get a neutral vector value.  This is simply a splat of the neutral
5717          scalar value if we have one, otherwise the initial scalar value
5718          is itself a neutral value.  */
5719       tree vector_identity = NULL_TREE;
5720       tree neutral_op = NULL_TREE;
5721       if (slp_node)
5722         {
5723           tree initial_value = NULL_TREE;
5724           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5725             initial_value = reduc_info->reduc_initial_values[0];
5726           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5727                                                  initial_value);
5728         }
5729       if (neutral_op)
5730         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5731                                                         neutral_op);
5732       for (unsigned int i = 0; i < group_size; ++i)
5733         {
5734           /* If there's no univeral neutral value, we can use the
5735              initial scalar value from the original PHI.  This is used
5736              for MIN and MAX reduction, for example.  */
5737           if (!neutral_op)
5738             {
5739               tree scalar_value = reduc_info->reduc_initial_values[i];
5740               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5741                                              scalar_value);
5742               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5743                                                               scalar_value);
5744             }
5745
5746           /* Calculate the equivalent of:
5747
5748              sel[j] = (index[j] == i);
5749
5750              which selects the elements of REDUC_INPUTS[0] that should
5751              be included in the result.  */
5752           tree compare_val = build_int_cst (index_elt_type, i);
5753           compare_val = build_vector_from_val (index_type, compare_val);
5754           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5755                                    index, compare_val);
5756
5757           /* Calculate the equivalent of:
5758
5759              vec = seq ? reduc_inputs[0] : vector_identity;
5760
5761              VEC is now suitable for a full vector reduction.  */
5762           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5763                                    sel, reduc_inputs[0], vector_identity);
5764
5765           /* Do the reduction and convert it to the appropriate type.  */
5766           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5767                                       TREE_TYPE (vectype), vec);
5768           scalar = gimple_convert (&seq, scalar_type, scalar);
5769           scalar_results.safe_push (scalar);
5770         }
5771       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5772     }
5773   else
5774     {
5775       bool reduce_with_shift;
5776       tree vec_temp;
5777
5778       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5779
5780       /* See if the target wants to do the final (shift) reduction
5781          in a vector mode of smaller size and first reduce upper/lower
5782          halves against each other.  */
5783       enum machine_mode mode1 = mode;
5784       tree stype = TREE_TYPE (vectype);
5785       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5786       unsigned nunits1 = nunits;
5787       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5788           && reduc_inputs.length () == 1)
5789         {
5790           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5791           /* For SLP reductions we have to make sure lanes match up, but
5792              since we're doing individual element final reduction reducing
5793              vector width here is even more important.
5794              ???  We can also separate lanes with permutes, for the common
5795              case of power-of-two group-size odd/even extracts would work.  */
5796           if (slp_reduc && nunits != nunits1)
5797             {
5798               nunits1 = least_common_multiple (nunits1, group_size);
5799               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5800             }
5801         }
5802       if (!slp_reduc
5803           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5804         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5805
5806       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5807                                                            stype, nunits1);
5808       reduce_with_shift = have_whole_vector_shift (mode1);
5809       if (!VECTOR_MODE_P (mode1))
5810         reduce_with_shift = false;
5811       else
5812         {
5813           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5814           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5815             reduce_with_shift = false;
5816         }
5817
5818       /* First reduce the vector to the desired vector size we should
5819          do shift reduction on by combining upper and lower halves.  */
5820       gimple_seq stmts = NULL;
5821       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5822                                              code, &stmts);
5823       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5824       reduc_inputs[0] = new_temp;
5825
5826       if (reduce_with_shift && !slp_reduc)
5827         {
5828           int element_bitsize = tree_to_uhwi (bitsize);
5829           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5830              for variable-length vectors and also requires direct target support
5831              for loop reductions.  */
5832           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5833           int nelements = vec_size_in_bits / element_bitsize;
5834           vec_perm_builder sel;
5835           vec_perm_indices indices;
5836
5837           int elt_offset;
5838
5839           tree zero_vec = build_zero_cst (vectype1);
5840           /* Case 2: Create:
5841              for (offset = nelements/2; offset >= 1; offset/=2)
5842                 {
5843                   Create:  va' = vec_shift <va, offset>
5844                   Create:  va = vop <va, va'>
5845                 }  */
5846
5847           tree rhs;
5848
5849           if (dump_enabled_p ())
5850             dump_printf_loc (MSG_NOTE, vect_location,
5851                              "Reduce using vector shifts\n");
5852
5853           gimple_seq stmts = NULL;
5854           new_temp = gimple_convert (&stmts, vectype1, new_temp);
5855           for (elt_offset = nelements / 2;
5856                elt_offset >= 1;
5857                elt_offset /= 2)
5858             {
5859               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5860               indices.new_vector (sel, 2, nelements);
5861               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5862               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5863                                        new_temp, zero_vec, mask);
5864               new_temp = gimple_build (&stmts, code,
5865                                        vectype1, new_name, new_temp);
5866             }
5867           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5868
5869           /* 2.4  Extract the final scalar result.  Create:
5870              s_out3 = extract_field <v_out2, bitpos>  */
5871
5872           if (dump_enabled_p ())
5873             dump_printf_loc (MSG_NOTE, vect_location,
5874                              "extract scalar result\n");
5875
5876           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5877                         bitsize, bitsize_zero_node);
5878           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5879           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5880           gimple_assign_set_lhs (epilog_stmt, new_temp);
5881           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5882           scalar_results.safe_push (new_temp);
5883         }
5884       else
5885         {
5886           /* Case 3: Create:
5887              s = extract_field <v_out2, 0>
5888              for (offset = element_size;
5889                   offset < vector_size;
5890                   offset += element_size;)
5891                {
5892                  Create:  s' = extract_field <v_out2, offset>
5893                  Create:  s = op <s, s'>  // For non SLP cases
5894                }  */
5895
5896           if (dump_enabled_p ())
5897             dump_printf_loc (MSG_NOTE, vect_location,
5898                              "Reduce using scalar code.\n");
5899
5900           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5901           int element_bitsize = tree_to_uhwi (bitsize);
5902           tree compute_type = TREE_TYPE (vectype);
5903           gimple_seq stmts = NULL;
5904           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5905             {
5906               int bit_offset;
5907               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5908                                        vec_temp, bitsize, bitsize_zero_node);
5909
5910               /* In SLP we don't need to apply reduction operation, so we just
5911                  collect s' values in SCALAR_RESULTS.  */
5912               if (slp_reduc)
5913                 scalar_results.safe_push (new_temp);
5914
5915               for (bit_offset = element_bitsize;
5916                    bit_offset < vec_size_in_bits;
5917                    bit_offset += element_bitsize)
5918                 {
5919                   tree bitpos = bitsize_int (bit_offset);
5920                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
5921                                            compute_type, vec_temp,
5922                                            bitsize, bitpos);
5923                   if (slp_reduc)
5924                     {
5925                       /* In SLP we don't need to apply reduction operation, so
5926                          we just collect s' values in SCALAR_RESULTS.  */
5927                       new_temp = new_name;
5928                       scalar_results.safe_push (new_name);
5929                     }
5930                   else
5931                     new_temp = gimple_build (&stmts, code, compute_type,
5932                                              new_name, new_temp);
5933                 }
5934             }
5935
5936           /* The only case where we need to reduce scalar results in SLP, is
5937              unrolling.  If the size of SCALAR_RESULTS is greater than
5938              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5939              REDUC_GROUP_SIZE.  */
5940           if (slp_reduc)
5941             {
5942               tree res, first_res, new_res;
5943
5944               /* Reduce multiple scalar results in case of SLP unrolling.  */
5945               for (j = group_size; scalar_results.iterate (j, &res);
5946                    j++)
5947                 {
5948                   first_res = scalar_results[j % group_size];
5949                   new_res = gimple_build (&stmts, code, compute_type,
5950                                           first_res, res);
5951                   scalar_results[j % group_size] = new_res;
5952                 }
5953               scalar_results.truncate (group_size);
5954               for (k = 0; k < group_size; k++)
5955                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5956                                                     scalar_results[k]);
5957             }
5958           else
5959             {
5960               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5961               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5962               scalar_results.safe_push (new_temp);
5963             }
5964
5965           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5966         }
5967
5968       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5969           && induc_val)
5970         {
5971           /* Earlier we set the initial value to be a vector if induc_val
5972              values.  Check the result and if it is induc_val then replace
5973              with the original initial value, unless induc_val is
5974              the same as initial_def already.  */
5975           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5976                                   induc_val);
5977           tree initial_def = reduc_info->reduc_initial_values[0];
5978
5979           tree tmp = make_ssa_name (new_scalar_dest);
5980           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5981                                              initial_def, new_temp);
5982           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5983           scalar_results[0] = tmp;
5984         }
5985     }
5986
5987   /* 2.5 Adjust the final result by the initial value of the reduction
5988          variable. (When such adjustment is not needed, then
5989          'adjustment_def' is zero).  For example, if code is PLUS we create:
5990          new_temp = loop_exit_def + adjustment_def  */
5991
5992   if (adjustment_def)
5993     {
5994       gcc_assert (!slp_reduc);
5995       gimple_seq stmts = NULL;
5996       if (double_reduc)
5997         {
5998           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5999           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6000           new_temp = gimple_build (&stmts, code, vectype,
6001                                    reduc_inputs[0], adjustment_def);
6002         }
6003       else
6004         {
6005           new_temp = scalar_results[0];
6006           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6007           adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
6008           new_temp = gimple_build (&stmts, code, scalar_type,
6009                                    new_temp, adjustment_def);
6010         }
6011
6012       epilog_stmt = gimple_seq_last_stmt (stmts);
6013       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6014       scalar_results[0] = new_temp;
6015     }
6016
6017   /* Record this operation if it could be reused by the epilogue loop.  */
6018   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
6019     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6020                                            { orig_reduc_input, reduc_info });
6021
6022   if (double_reduc)
6023     loop = outer_loop;
6024
6025   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6026           phis with new adjusted scalar results, i.e., replace use <s_out0>
6027           with use <s_out4>.
6028
6029      Transform:
6030         loop_exit:
6031           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6032           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6033           v_out2 = reduce <v_out1>
6034           s_out3 = extract_field <v_out2, 0>
6035           s_out4 = adjust_result <s_out3>
6036           use <s_out0>
6037           use <s_out0>
6038
6039      into:
6040
6041         loop_exit:
6042           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6043           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6044           v_out2 = reduce <v_out1>
6045           s_out3 = extract_field <v_out2, 0>
6046           s_out4 = adjust_result <s_out3>
6047           use <s_out4>
6048           use <s_out4> */
6049
6050   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6051   for (k = 0; k < live_out_stmts.size (); k++)
6052     {
6053       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6054       scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
6055
6056       phis.create (3);
6057       /* Find the loop-closed-use at the loop exit of the original scalar
6058          result.  (The reduction result is expected to have two immediate uses,
6059          one at the latch block, and one at the loop exit).  For double
6060          reductions we are looking for exit phis of the outer loop.  */
6061       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6062         {
6063           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6064             {
6065               if (!is_gimple_debug (USE_STMT (use_p)))
6066                 phis.safe_push (USE_STMT (use_p));
6067             }
6068           else
6069             {
6070               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6071                 {
6072                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6073
6074                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6075                     {
6076                       if (!flow_bb_inside_loop_p (loop,
6077                                              gimple_bb (USE_STMT (phi_use_p)))
6078                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6079                         phis.safe_push (USE_STMT (phi_use_p));
6080                     }
6081                 }
6082             }
6083         }
6084
6085       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6086         {
6087           /* Replace the uses:  */
6088           orig_name = PHI_RESULT (exit_phi);
6089
6090           /* Look for a single use at the target of the skip edge.  */
6091           if (unify_with_main_loop_p)
6092             {
6093               use_operand_p use_p;
6094               gimple *user;
6095               if (!single_imm_use (orig_name, &use_p, &user))
6096                 gcc_unreachable ();
6097               orig_name = gimple_get_lhs (user);
6098             }
6099
6100           scalar_result = scalar_results[k];
6101           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6102             {
6103               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6104                 SET_USE (use_p, scalar_result);
6105               update_stmt (use_stmt);
6106             }
6107         }
6108
6109       phis.release ();
6110     }
6111 }
6112
6113 /* Return a vector of type VECTYPE that is equal to the vector select
6114    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6115    before GSI.  */
6116
6117 static tree
6118 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6119                      tree vec, tree identity)
6120 {
6121   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6122   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6123                                           mask, vec, identity);
6124   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6125   return cond;
6126 }
6127
6128 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6129    order, starting with LHS.  Insert the extraction statements before GSI and
6130    associate the new scalar SSA names with variable SCALAR_DEST.
6131    Return the SSA name for the result.  */
6132
6133 static tree
6134 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6135                        tree_code code, tree lhs, tree vector_rhs)
6136 {
6137   tree vectype = TREE_TYPE (vector_rhs);
6138   tree scalar_type = TREE_TYPE (vectype);
6139   tree bitsize = TYPE_SIZE (scalar_type);
6140   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6141   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6142
6143   for (unsigned HOST_WIDE_INT bit_offset = 0;
6144        bit_offset < vec_size_in_bits;
6145        bit_offset += element_bitsize)
6146     {
6147       tree bitpos = bitsize_int (bit_offset);
6148       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6149                          bitsize, bitpos);
6150
6151       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6152       rhs = make_ssa_name (scalar_dest, stmt);
6153       gimple_assign_set_lhs (stmt, rhs);
6154       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6155
6156       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6157       tree new_name = make_ssa_name (scalar_dest, stmt);
6158       gimple_assign_set_lhs (stmt, new_name);
6159       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6160       lhs = new_name;
6161     }
6162   return lhs;
6163 }
6164
6165 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6166    type of the vector input.  */
6167
6168 static internal_fn
6169 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6170 {
6171   internal_fn mask_reduc_fn;
6172
6173   switch (reduc_fn)
6174     {
6175     case IFN_FOLD_LEFT_PLUS:
6176       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6177       break;
6178
6179     default:
6180       return IFN_LAST;
6181     }
6182
6183   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6184                                       OPTIMIZE_FOR_SPEED))
6185     return mask_reduc_fn;
6186   return IFN_LAST;
6187 }
6188
6189 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6190    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6191    statement.  CODE is the operation performed by STMT_INFO and OPS are
6192    its scalar operands.  REDUC_INDEX is the index of the operand in
6193    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6194    implements in-order reduction, or IFN_LAST if we should open-code it.
6195    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6196    that should be used to control the operation in a fully-masked loop.  */
6197
6198 static bool
6199 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6200                                stmt_vec_info stmt_info,
6201                                gimple_stmt_iterator *gsi,
6202                                gimple **vec_stmt, slp_tree slp_node,
6203                                gimple *reduc_def_stmt,
6204                                tree_code code, internal_fn reduc_fn,
6205                                tree ops[3], tree vectype_in,
6206                                int reduc_index, vec_loop_masks *masks)
6207 {
6208   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6209   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6210   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6211
6212   int ncopies;
6213   if (slp_node)
6214     ncopies = 1;
6215   else
6216     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6217
6218   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6219   gcc_assert (ncopies == 1);
6220   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6221
6222   if (slp_node)
6223     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6224                           TYPE_VECTOR_SUBPARTS (vectype_in)));
6225
6226   tree op0 = ops[1 - reduc_index];
6227
6228   int group_size = 1;
6229   stmt_vec_info scalar_dest_def_info;
6230   auto_vec<tree> vec_oprnds0;
6231   if (slp_node)
6232     {
6233       auto_vec<vec<tree> > vec_defs (2);
6234       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6235       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6236       vec_defs[0].release ();
6237       vec_defs[1].release ();
6238       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6239       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6240     }
6241   else
6242     {
6243       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6244                                      op0, &vec_oprnds0);
6245       scalar_dest_def_info = stmt_info;
6246     }
6247
6248   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6249   tree scalar_type = TREE_TYPE (scalar_dest);
6250   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6251
6252   int vec_num = vec_oprnds0.length ();
6253   gcc_assert (vec_num == 1 || slp_node);
6254   tree vec_elem_type = TREE_TYPE (vectype_out);
6255   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6256
6257   tree vector_identity = NULL_TREE;
6258   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6259     vector_identity = build_zero_cst (vectype_out);
6260
6261   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6262   int i;
6263   tree def0;
6264   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6265     {
6266       gimple *new_stmt;
6267       tree mask = NULL_TREE;
6268       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6269         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6270
6271       /* Handle MINUS by adding the negative.  */
6272       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6273         {
6274           tree negated = make_ssa_name (vectype_out);
6275           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6276           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6277           def0 = negated;
6278         }
6279
6280       if (mask && mask_reduc_fn == IFN_LAST)
6281         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6282                                     vector_identity);
6283
6284       /* On the first iteration the input is simply the scalar phi
6285          result, and for subsequent iterations it is the output of
6286          the preceding operation.  */
6287       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6288         {
6289           if (mask && mask_reduc_fn != IFN_LAST)
6290             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6291                                                    def0, mask);
6292           else
6293             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6294                                                    def0);
6295           /* For chained SLP reductions the output of the previous reduction
6296              operation serves as the input of the next. For the final statement
6297              the output cannot be a temporary - we reuse the original
6298              scalar destination of the last statement.  */
6299           if (i != vec_num - 1)
6300             {
6301               gimple_set_lhs (new_stmt, scalar_dest_var);
6302               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6303               gimple_set_lhs (new_stmt, reduc_var);
6304             }
6305         }
6306       else
6307         {
6308           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6309                                              reduc_var, def0);
6310           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6311           /* Remove the statement, so that we can use the same code paths
6312              as for statements that we've just created.  */
6313           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6314           gsi_remove (&tmp_gsi, true);
6315         }
6316
6317       if (i == vec_num - 1)
6318         {
6319           gimple_set_lhs (new_stmt, scalar_dest);
6320           vect_finish_replace_stmt (loop_vinfo,
6321                                     scalar_dest_def_info,
6322                                     new_stmt);
6323         }
6324       else
6325         vect_finish_stmt_generation (loop_vinfo,
6326                                      scalar_dest_def_info,
6327                                      new_stmt, gsi);
6328
6329       if (slp_node)
6330         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6331       else
6332         {
6333           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6334           *vec_stmt = new_stmt;
6335         }
6336     }
6337
6338   return true;
6339 }
6340
6341 /* Function is_nonwrapping_integer_induction.
6342
6343    Check if STMT_VINO (which is part of loop LOOP) both increments and
6344    does not cause overflow.  */
6345
6346 static bool
6347 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6348 {
6349   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6350   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6351   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6352   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6353   widest_int ni, max_loop_value, lhs_max;
6354   wi::overflow_type overflow = wi::OVF_NONE;
6355
6356   /* Make sure the loop is integer based.  */
6357   if (TREE_CODE (base) != INTEGER_CST
6358       || TREE_CODE (step) != INTEGER_CST)
6359     return false;
6360
6361   /* Check that the max size of the loop will not wrap.  */
6362
6363   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6364     return true;
6365
6366   if (! max_stmt_executions (loop, &ni))
6367     return false;
6368
6369   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6370                             &overflow);
6371   if (overflow)
6372     return false;
6373
6374   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6375                             TYPE_SIGN (lhs_type), &overflow);
6376   if (overflow)
6377     return false;
6378
6379   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6380           <= TYPE_PRECISION (lhs_type));
6381 }
6382
6383 /* Check if masking can be supported by inserting a conditional expression.
6384    CODE is the code for the operation.  COND_FN is the conditional internal
6385    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
6386 static bool
6387 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6388                          tree vectype_in)
6389 {
6390   if (cond_fn != IFN_LAST
6391       && direct_internal_fn_supported_p (cond_fn, vectype_in,
6392                                          OPTIMIZE_FOR_SPEED))
6393     return false;
6394
6395   switch (code)
6396     {
6397     case DOT_PROD_EXPR:
6398     case SAD_EXPR:
6399       return true;
6400
6401     default:
6402       return false;
6403     }
6404 }
6405
6406 /* Insert a conditional expression to enable masked vectorization.  CODE is the
6407    code for the operation.  VOP is the array of operands.  MASK is the loop
6408    mask.  GSI is a statement iterator used to place the new conditional
6409    expression.  */
6410 static void
6411 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6412                       gimple_stmt_iterator *gsi)
6413 {
6414   switch (code)
6415     {
6416     case DOT_PROD_EXPR:
6417       {
6418         tree vectype = TREE_TYPE (vop[1]);
6419         tree zero = build_zero_cst (vectype);
6420         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6421         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6422                                                mask, vop[1], zero);
6423         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6424         vop[1] = masked_op1;
6425         break;
6426       }
6427
6428     case SAD_EXPR:
6429       {
6430         tree vectype = TREE_TYPE (vop[1]);
6431         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6432         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6433                                                mask, vop[1], vop[0]);
6434         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6435         vop[1] = masked_op1;
6436         break;
6437       }
6438
6439     default:
6440       gcc_unreachable ();
6441     }
6442 }
6443
6444 /* Function vectorizable_reduction.
6445
6446    Check if STMT_INFO performs a reduction operation that can be vectorized.
6447    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6448    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6449    Return true if STMT_INFO is vectorizable in this way.
6450
6451    This function also handles reduction idioms (patterns) that have been
6452    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
6453    may be of this form:
6454      X = pattern_expr (arg0, arg1, ..., X)
6455    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6456    sequence that had been detected and replaced by the pattern-stmt
6457    (STMT_INFO).
6458
6459    This function also handles reduction of condition expressions, for example:
6460      for (int i = 0; i < N; i++)
6461        if (a[i] < value)
6462          last = a[i];
6463    This is handled by vectorising the loop and creating an additional vector
6464    containing the loop indexes for which "a[i] < value" was true.  In the
6465    function epilogue this is reduced to a single max value and then used to
6466    index into the vector of results.
6467
6468    In some cases of reduction patterns, the type of the reduction variable X is
6469    different than the type of the other arguments of STMT_INFO.
6470    In such cases, the vectype that is used when transforming STMT_INFO into
6471    a vector stmt is different than the vectype that is used to determine the
6472    vectorization factor, because it consists of a different number of elements
6473    than the actual number of elements that are being operated upon in parallel.
6474
6475    For example, consider an accumulation of shorts into an int accumulator.
6476    On some targets it's possible to vectorize this pattern operating on 8
6477    shorts at a time (hence, the vectype for purposes of determining the
6478    vectorization factor should be V8HI); on the other hand, the vectype that
6479    is used to create the vector form is actually V4SI (the type of the result).
6480
6481    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6482    indicates what is the actual level of parallelism (V8HI in the example), so
6483    that the right vectorization factor would be derived.  This vectype
6484    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6485    be used to create the vectorized stmt.  The right vectype for the vectorized
6486    stmt is obtained from the type of the result X:
6487       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6488
6489    This means that, contrary to "regular" reductions (or "regular" stmts in
6490    general), the following equation:
6491       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6492    does *NOT* necessarily hold for reduction patterns.  */
6493
6494 bool
6495 vectorizable_reduction (loop_vec_info loop_vinfo,
6496                         stmt_vec_info stmt_info, slp_tree slp_node,
6497                         slp_instance slp_node_instance,
6498                         stmt_vector_for_cost *cost_vec)
6499 {
6500   tree scalar_dest;
6501   tree vectype_in = NULL_TREE;
6502   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6503   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6504   stmt_vec_info cond_stmt_vinfo = NULL;
6505   tree scalar_type;
6506   int i;
6507   int ncopies;
6508   bool single_defuse_cycle = false;
6509   bool nested_cycle = false;
6510   bool double_reduc = false;
6511   int vec_num;
6512   tree tem;
6513   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6514   tree cond_reduc_val = NULL_TREE;
6515
6516   /* Make sure it was already recognized as a reduction computation.  */
6517   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6518       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6519       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6520     return false;
6521
6522   /* The stmt we store reduction analysis meta on.  */
6523   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6524   reduc_info->is_reduc_info = true;
6525
6526   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6527     {
6528       if (is_a <gphi *> (stmt_info->stmt))
6529         {
6530           if (slp_node)
6531             {
6532               /* We eventually need to set a vector type on invariant
6533                  arguments.  */
6534               unsigned j;
6535               slp_tree child;
6536               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6537                 if (!vect_maybe_update_slp_op_vectype
6538                        (child, SLP_TREE_VECTYPE (slp_node)))
6539                   {
6540                     if (dump_enabled_p ())
6541                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6542                                        "incompatible vector types for "
6543                                        "invariants\n");
6544                     return false;
6545                   }
6546             }
6547           /* Analysis for double-reduction is done on the outer
6548              loop PHI, nested cycles have no further restrictions.  */
6549           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6550         }
6551       else
6552         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6553       return true;
6554     }
6555
6556   stmt_vec_info orig_stmt_of_analysis = stmt_info;
6557   stmt_vec_info phi_info = stmt_info;
6558   if (!is_a <gphi *> (stmt_info->stmt))
6559     {
6560       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6561       return true;
6562     }
6563   if (slp_node)
6564     {
6565       slp_node_instance->reduc_phis = slp_node;
6566       /* ???  We're leaving slp_node to point to the PHIs, we only
6567          need it to get at the number of vector stmts which wasn't
6568          yet initialized for the instance root.  */
6569     }
6570   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6571     stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6572   else
6573     {
6574       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6575                   == vect_double_reduction_def);
6576       use_operand_p use_p;
6577       gimple *use_stmt;
6578       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6579                                  &use_p, &use_stmt);
6580       gcc_assert (res);
6581       phi_info = loop_vinfo->lookup_stmt (use_stmt);
6582       stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6583     }
6584
6585   /* PHIs should not participate in patterns.  */
6586   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6587   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6588
6589   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6590      and compute the reduction chain length.  Discover the real
6591      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
6592   tree reduc_def
6593     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6594                              loop_latch_edge
6595                                (gimple_bb (reduc_def_phi)->loop_father));
6596   unsigned reduc_chain_length = 0;
6597   bool only_slp_reduc_chain = true;
6598   stmt_info = NULL;
6599   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6600   while (reduc_def != PHI_RESULT (reduc_def_phi))
6601     {
6602       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6603       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6604       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6605         {
6606           if (dump_enabled_p ())
6607             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6608                              "reduction chain broken by patterns.\n");
6609           return false;
6610         }
6611       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6612         only_slp_reduc_chain = false;
6613       /* ???  For epilogue generation live members of the chain need
6614          to point back to the PHI via their original stmt for
6615          info_for_reduction to work.  */
6616       if (STMT_VINFO_LIVE_P (vdef))
6617         STMT_VINFO_REDUC_DEF (def) = phi_info;
6618       gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6619       if (!assign)
6620         {
6621           if (dump_enabled_p ())
6622             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6623                              "reduction chain includes calls.\n");
6624           return false;
6625         }
6626       if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6627         {
6628           if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6629                                       TREE_TYPE (gimple_assign_rhs1 (assign))))
6630             {
6631               if (dump_enabled_p ())
6632                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6633                                  "conversion in the reduction chain.\n");
6634               return false;
6635             }
6636         }
6637       else if (!stmt_info)
6638         /* First non-conversion stmt.  */
6639         stmt_info = vdef;
6640       reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6641       reduc_chain_length++;
6642       if (!stmt_info && slp_node)
6643         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6644     }
6645   /* PHIs should not participate in patterns.  */
6646   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6647
6648   if (nested_in_vect_loop_p (loop, stmt_info))
6649     {
6650       loop = loop->inner;
6651       nested_cycle = true;
6652     }
6653
6654   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6655      element.  */
6656   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6657     {
6658       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6659       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6660     }
6661   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6662     gcc_assert (slp_node
6663                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6664
6665   /* 1. Is vectorizable reduction?  */
6666   /* Not supportable if the reduction variable is used in the loop, unless
6667      it's a reduction chain.  */
6668   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6669       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6670     return false;
6671
6672   /* Reductions that are not used even in an enclosing outer-loop,
6673      are expected to be "live" (used out of the loop).  */
6674   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6675       && !STMT_VINFO_LIVE_P (stmt_info))
6676     return false;
6677
6678   /* 2. Has this been recognized as a reduction pattern?
6679
6680      Check if STMT represents a pattern that has been recognized
6681      in earlier analysis stages.  For stmts that represent a pattern,
6682      the STMT_VINFO_RELATED_STMT field records the last stmt in
6683      the original sequence that constitutes the pattern.  */
6684
6685   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6686   if (orig_stmt_info)
6687     {
6688       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6689       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6690     }
6691
6692   /* 3. Check the operands of the operation.  The first operands are defined
6693         inside the loop body. The last operand is the reduction variable,
6694         which is defined by the loop-header-phi.  */
6695
6696   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6697   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6698   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6699   enum tree_code code = gimple_assign_rhs_code (stmt);
6700   bool lane_reduc_code_p
6701     = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6702   int op_type = TREE_CODE_LENGTH (code);
6703   enum optab_subtype optab_query_kind = optab_vector;
6704   if (code == DOT_PROD_EXPR
6705       && TYPE_SIGN (TREE_TYPE (gimple_assign_rhs1 (stmt)))
6706            != TYPE_SIGN (TREE_TYPE (gimple_assign_rhs2 (stmt))))
6707     optab_query_kind = optab_vector_mixed_sign;
6708
6709
6710   scalar_dest = gimple_assign_lhs (stmt);
6711   scalar_type = TREE_TYPE (scalar_dest);
6712   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6713       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6714     return false;
6715
6716   /* Do not try to vectorize bit-precision reductions.  */
6717   if (!type_has_mode_precision_p (scalar_type))
6718     return false;
6719
6720   /* For lane-reducing ops we're reducing the number of reduction PHIs
6721      which means the only use of that may be in the lane-reducing operation.  */
6722   if (lane_reduc_code_p
6723       && reduc_chain_length != 1
6724       && !only_slp_reduc_chain)
6725     {
6726       if (dump_enabled_p ())
6727         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6728                          "lane-reducing reduction with extra stmts.\n");
6729       return false;
6730     }
6731
6732   /* All uses but the last are expected to be defined in the loop.
6733      The last use is the reduction variable.  In case of nested cycle this
6734      assumption is not true: we use reduc_index to record the index of the
6735      reduction variable.  */
6736   slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6737   /* We need to skip an extra operand for COND_EXPRs with embedded
6738      comparison.  */
6739   unsigned opno_adjust = 0;
6740   if (code == COND_EXPR
6741       && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6742     opno_adjust = 1;
6743   for (i = 0; i < op_type; i++)
6744     {
6745       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6746       if (i == 0 && code == COND_EXPR)
6747         continue;
6748
6749       stmt_vec_info def_stmt_info;
6750       enum vect_def_type dt;
6751       tree op;
6752       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6753                                i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6754                                &def_stmt_info))
6755         {
6756           if (dump_enabled_p ())
6757             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6758                              "use not simple.\n");
6759           return false;
6760         }
6761       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6762         continue;
6763
6764       /* There should be only one cycle def in the stmt, the one
6765          leading to reduc_def.  */
6766       if (VECTORIZABLE_CYCLE_DEF (dt))
6767         return false;
6768
6769       /* To properly compute ncopies we are interested in the widest
6770          non-reduction input type in case we're looking at a widening
6771          accumulation that we later handle in vect_transform_reduction.  */
6772       if (lane_reduc_code_p
6773           && tem
6774           && (!vectype_in
6775               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6776                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6777         vectype_in = tem;
6778
6779       if (code == COND_EXPR)
6780         {
6781           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
6782           if (dt == vect_constant_def)
6783             {
6784               cond_reduc_dt = dt;
6785               cond_reduc_val = op;
6786             }
6787           if (dt == vect_induction_def
6788               && def_stmt_info
6789               && is_nonwrapping_integer_induction (def_stmt_info, loop))
6790             {
6791               cond_reduc_dt = dt;
6792               cond_stmt_vinfo = def_stmt_info;
6793             }
6794         }
6795     }
6796   if (!vectype_in)
6797     vectype_in = STMT_VINFO_VECTYPE (phi_info);
6798   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6799
6800   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6801   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6802   /* If we have a condition reduction, see if we can simplify it further.  */
6803   if (v_reduc_type == COND_REDUCTION)
6804     {
6805       if (slp_node)
6806         return false;
6807
6808       /* When the condition uses the reduction value in the condition, fail.  */
6809       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6810         {
6811           if (dump_enabled_p ())
6812             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6813                              "condition depends on previous iteration\n");
6814           return false;
6815         }
6816
6817       if (reduc_chain_length == 1
6818           && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6819                                              vectype_in, OPTIMIZE_FOR_SPEED))
6820         {
6821           if (dump_enabled_p ())
6822             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6823                              "optimizing condition reduction with"
6824                              " FOLD_EXTRACT_LAST.\n");
6825           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6826         }
6827       else if (cond_reduc_dt == vect_induction_def)
6828         {
6829           tree base
6830             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6831           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6832
6833           gcc_assert (TREE_CODE (base) == INTEGER_CST
6834                       && TREE_CODE (step) == INTEGER_CST);
6835           cond_reduc_val = NULL_TREE;
6836           enum tree_code cond_reduc_op_code = ERROR_MARK;
6837           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6838           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6839             ;
6840           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6841              above base; punt if base is the minimum value of the type for
6842              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6843           else if (tree_int_cst_sgn (step) == -1)
6844             {
6845               cond_reduc_op_code = MIN_EXPR;
6846               if (tree_int_cst_sgn (base) == -1)
6847                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6848               else if (tree_int_cst_lt (base,
6849                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6850                 cond_reduc_val
6851                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6852             }
6853           else
6854             {
6855               cond_reduc_op_code = MAX_EXPR;
6856               if (tree_int_cst_sgn (base) == 1)
6857                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6858               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6859                                         base))
6860                 cond_reduc_val
6861                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6862             }
6863           if (cond_reduc_val)
6864             {
6865               if (dump_enabled_p ())
6866                 dump_printf_loc (MSG_NOTE, vect_location,
6867                                  "condition expression based on "
6868                                  "integer induction.\n");
6869               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6870               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6871                 = cond_reduc_val;
6872               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6873             }
6874         }
6875       else if (cond_reduc_dt == vect_constant_def)
6876         {
6877           enum vect_def_type cond_initial_dt;
6878           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6879           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6880           if (cond_initial_dt == vect_constant_def
6881               && types_compatible_p (TREE_TYPE (cond_initial_val),
6882                                      TREE_TYPE (cond_reduc_val)))
6883             {
6884               tree e = fold_binary (LE_EXPR, boolean_type_node,
6885                                     cond_initial_val, cond_reduc_val);
6886               if (e && (integer_onep (e) || integer_zerop (e)))
6887                 {
6888                   if (dump_enabled_p ())
6889                     dump_printf_loc (MSG_NOTE, vect_location,
6890                                      "condition expression based on "
6891                                      "compile time constant.\n");
6892                   /* Record reduction code at analysis stage.  */
6893                   STMT_VINFO_REDUC_CODE (reduc_info)
6894                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6895                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6896                 }
6897             }
6898         }
6899     }
6900
6901   if (STMT_VINFO_LIVE_P (phi_info))
6902     return false;
6903
6904   if (slp_node)
6905     ncopies = 1;
6906   else
6907     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6908
6909   gcc_assert (ncopies >= 1);
6910
6911   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6912
6913   if (nested_cycle)
6914     {
6915       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6916                   == vect_double_reduction_def);
6917       double_reduc = true;
6918     }
6919
6920   /* 4.2. Check support for the epilog operation.
6921
6922           If STMT represents a reduction pattern, then the type of the
6923           reduction variable may be different than the type of the rest
6924           of the arguments.  For example, consider the case of accumulation
6925           of shorts into an int accumulator; The original code:
6926                         S1: int_a = (int) short_a;
6927           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6928
6929           was replaced with:
6930                         STMT: int_acc = widen_sum <short_a, int_acc>
6931
6932           This means that:
6933           1. The tree-code that is used to create the vector operation in the
6934              epilog code (that reduces the partial results) is not the
6935              tree-code of STMT, but is rather the tree-code of the original
6936              stmt from the pattern that STMT is replacing.  I.e, in the example
6937              above we want to use 'widen_sum' in the loop, but 'plus' in the
6938              epilog.
6939           2. The type (mode) we use to check available target support
6940              for the vector operation to be created in the *epilog*, is
6941              determined by the type of the reduction variable (in the example
6942              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6943              However the type (mode) we use to check available target support
6944              for the vector operation to be created *inside the loop*, is
6945              determined by the type of the other arguments to STMT (in the
6946              example we'd check this: optab_handler (widen_sum_optab,
6947              vect_short_mode)).
6948
6949           This is contrary to "regular" reductions, in which the types of all
6950           the arguments are the same as the type of the reduction variable.
6951           For "regular" reductions we can therefore use the same vector type
6952           (and also the same tree-code) when generating the epilog code and
6953           when generating the code inside the loop.  */
6954
6955   enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6956   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6957
6958   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6959   if (reduction_type == TREE_CODE_REDUCTION)
6960     {
6961       /* Check whether it's ok to change the order of the computation.
6962          Generally, when vectorizing a reduction we change the order of the
6963          computation.  This may change the behavior of the program in some
6964          cases, so we need to check that this is ok.  One exception is when
6965          vectorizing an outer-loop: the inner-loop is executed sequentially,
6966          and therefore vectorizing reductions in the inner-loop during
6967          outer-loop vectorization is safe.  Likewise when we are vectorizing
6968          a series of reductions using SLP and the VF is one the reductions
6969          are performed in scalar order.  */
6970       if (slp_node
6971           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6972           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6973         ;
6974       else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6975         {
6976           /* When vectorizing a reduction chain w/o SLP the reduction PHI
6977              is not directy used in stmt.  */
6978           if (!only_slp_reduc_chain
6979               && reduc_chain_length != 1)
6980             {
6981               if (dump_enabled_p ())
6982                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6983                                  "in-order reduction chain without SLP.\n");
6984               return false;
6985             }
6986           STMT_VINFO_REDUC_TYPE (reduc_info)
6987             = reduction_type = FOLD_LEFT_REDUCTION;
6988         }
6989       else if (!commutative_tree_code (orig_code)
6990                || !associative_tree_code (orig_code))
6991         {
6992           if (dump_enabled_p ())
6993             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6994                             "reduction: not commutative/associative");
6995           return false;
6996         }
6997     }
6998
6999   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7000       && ncopies > 1)
7001     {
7002       if (dump_enabled_p ())
7003         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7004                          "multiple types in double reduction or condition "
7005                          "reduction or fold-left reduction.\n");
7006       return false;
7007     }
7008
7009   internal_fn reduc_fn = IFN_LAST;
7010   if (reduction_type == TREE_CODE_REDUCTION
7011       || reduction_type == FOLD_LEFT_REDUCTION
7012       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7013       || reduction_type == CONST_COND_REDUCTION)
7014     {
7015       if (reduction_type == FOLD_LEFT_REDUCTION
7016           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7017           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7018         {
7019           if (reduc_fn != IFN_LAST
7020               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7021                                                   OPTIMIZE_FOR_SPEED))
7022             {
7023               if (dump_enabled_p ())
7024                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7025                                  "reduc op not supported by target.\n");
7026
7027               reduc_fn = IFN_LAST;
7028             }
7029         }
7030       else
7031         {
7032           if (!nested_cycle || double_reduc)
7033             {
7034               if (dump_enabled_p ())
7035                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7036                                  "no reduc code for scalar code.\n");
7037
7038               return false;
7039             }
7040         }
7041     }
7042   else if (reduction_type == COND_REDUCTION)
7043     {
7044       int scalar_precision
7045         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7046       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7047       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7048                                                 vectype_out);
7049
7050       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7051                                           OPTIMIZE_FOR_SPEED))
7052         reduc_fn = IFN_REDUC_MAX;
7053     }
7054   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7055
7056   if (reduction_type != EXTRACT_LAST_REDUCTION
7057       && (!nested_cycle || double_reduc)
7058       && reduc_fn == IFN_LAST
7059       && !nunits_out.is_constant ())
7060     {
7061       if (dump_enabled_p ())
7062         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7063                          "missing target support for reduction on"
7064                          " variable-length vectors.\n");
7065       return false;
7066     }
7067
7068   /* For SLP reductions, see if there is a neutral value we can use.  */
7069   tree neutral_op = NULL_TREE;
7070   if (slp_node)
7071     {
7072       tree initial_value = NULL_TREE;
7073       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7074         initial_value = vect_phi_initial_value (reduc_def_phi);
7075       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7076                                              orig_code, initial_value);
7077     }
7078
7079   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7080     {
7081       /* We can't support in-order reductions of code such as this:
7082
7083            for (int i = 0; i < n1; ++i)
7084              for (int j = 0; j < n2; ++j)
7085                l += a[j];
7086
7087          since GCC effectively transforms the loop when vectorizing:
7088
7089            for (int i = 0; i < n1 / VF; ++i)
7090              for (int j = 0; j < n2; ++j)
7091                for (int k = 0; k < VF; ++k)
7092                  l += a[j];
7093
7094          which is a reassociation of the original operation.  */
7095       if (dump_enabled_p ())
7096         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7097                          "in-order double reduction not supported.\n");
7098
7099       return false;
7100     }
7101
7102   if (reduction_type == FOLD_LEFT_REDUCTION
7103       && slp_node
7104       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7105     {
7106       /* We cannot use in-order reductions in this case because there is
7107          an implicit reassociation of the operations involved.  */
7108       if (dump_enabled_p ())
7109         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7110                          "in-order unchained SLP reductions not supported.\n");
7111       return false;
7112     }
7113
7114   /* For double reductions, and for SLP reductions with a neutral value,
7115      we construct a variable-length initial vector by loading a vector
7116      full of the neutral value and then shift-and-inserting the start
7117      values into the low-numbered elements.  */
7118   if ((double_reduc || neutral_op)
7119       && !nunits_out.is_constant ()
7120       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7121                                           vectype_out, OPTIMIZE_FOR_SPEED))
7122     {
7123       if (dump_enabled_p ())
7124         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7125                          "reduction on variable-length vectors requires"
7126                          " target support for a vector-shift-and-insert"
7127                          " operation.\n");
7128       return false;
7129     }
7130
7131   /* Check extra constraints for variable-length unchained SLP reductions.  */
7132   if (STMT_SLP_TYPE (stmt_info)
7133       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7134       && !nunits_out.is_constant ())
7135     {
7136       /* We checked above that we could build the initial vector when
7137          there's a neutral element value.  Check here for the case in
7138          which each SLP statement has its own initial value and in which
7139          that value needs to be repeated for every instance of the
7140          statement within the initial vector.  */
7141       unsigned int group_size = SLP_TREE_LANES (slp_node);
7142       if (!neutral_op
7143           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7144                                               TREE_TYPE (vectype_out)))
7145         {
7146           if (dump_enabled_p ())
7147             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7148                              "unsupported form of SLP reduction for"
7149                              " variable-length vectors: cannot build"
7150                              " initial vector.\n");
7151           return false;
7152         }
7153       /* The epilogue code relies on the number of elements being a multiple
7154          of the group size.  The duplicate-and-interleave approach to setting
7155          up the initial vector does too.  */
7156       if (!multiple_p (nunits_out, group_size))
7157         {
7158           if (dump_enabled_p ())
7159             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7160                              "unsupported form of SLP reduction for"
7161                              " variable-length vectors: the vector size"
7162                              " is not a multiple of the number of results.\n");
7163           return false;
7164         }
7165     }
7166
7167   if (reduction_type == COND_REDUCTION)
7168     {
7169       widest_int ni;
7170
7171       if (! max_loop_iterations (loop, &ni))
7172         {
7173           if (dump_enabled_p ())
7174             dump_printf_loc (MSG_NOTE, vect_location,
7175                              "loop count not known, cannot create cond "
7176                              "reduction.\n");
7177           return false;
7178         }
7179       /* Convert backedges to iterations.  */
7180       ni += 1;
7181
7182       /* The additional index will be the same type as the condition.  Check
7183          that the loop can fit into this less one (because we'll use up the
7184          zero slot for when there are no matches).  */
7185       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7186       if (wi::geu_p (ni, wi::to_widest (max_index)))
7187         {
7188           if (dump_enabled_p ())
7189             dump_printf_loc (MSG_NOTE, vect_location,
7190                              "loop size is greater than data size.\n");
7191           return false;
7192         }
7193     }
7194
7195   /* In case the vectorization factor (VF) is bigger than the number
7196      of elements that we can fit in a vectype (nunits), we have to generate
7197      more than one vector stmt - i.e - we need to "unroll" the
7198      vector stmt by a factor VF/nunits.  For more details see documentation
7199      in vectorizable_operation.  */
7200
7201   /* If the reduction is used in an outer loop we need to generate
7202      VF intermediate results, like so (e.g. for ncopies=2):
7203         r0 = phi (init, r0)
7204         r1 = phi (init, r1)
7205         r0 = x0 + r0;
7206         r1 = x1 + r1;
7207     (i.e. we generate VF results in 2 registers).
7208     In this case we have a separate def-use cycle for each copy, and therefore
7209     for each copy we get the vector def for the reduction variable from the
7210     respective phi node created for this copy.
7211
7212     Otherwise (the reduction is unused in the loop nest), we can combine
7213     together intermediate results, like so (e.g. for ncopies=2):
7214         r = phi (init, r)
7215         r = x0 + r;
7216         r = x1 + r;
7217    (i.e. we generate VF/2 results in a single register).
7218    In this case for each copy we get the vector def for the reduction variable
7219    from the vectorized reduction operation generated in the previous iteration.
7220
7221    This only works when we see both the reduction PHI and its only consumer
7222    in vectorizable_reduction and there are no intermediate stmts
7223    participating.  */
7224   if (ncopies > 1
7225       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7226       && reduc_chain_length == 1)
7227     single_defuse_cycle = true;
7228
7229   if (single_defuse_cycle || lane_reduc_code_p)
7230     {
7231       gcc_assert (code != COND_EXPR);
7232
7233       /* 4. Supportable by target?  */
7234       bool ok = true;
7235
7236       /* 4.1. check support for the operation in the loop  */
7237       optab optab = optab_for_tree_code (code, vectype_in, optab_query_kind);
7238       if (!optab)
7239         {
7240           if (dump_enabled_p ())
7241             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7242                              "no optab.\n");
7243           ok = false;
7244         }
7245
7246       machine_mode vec_mode = TYPE_MODE (vectype_in);
7247       if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7248         {
7249           if (dump_enabled_p ())
7250             dump_printf (MSG_NOTE, "op not supported by target.\n");
7251           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7252               || !vect_can_vectorize_without_simd_p (code))
7253             ok = false;
7254           else
7255             if (dump_enabled_p ())
7256               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7257         }
7258
7259       if (vect_emulated_vector_p (vectype_in)
7260           && !vect_can_vectorize_without_simd_p (code))
7261         {
7262           if (dump_enabled_p ())
7263             dump_printf (MSG_NOTE, "using word mode not possible.\n");
7264           return false;
7265         }
7266
7267       /* lane-reducing operations have to go through vect_transform_reduction.
7268          For the other cases try without the single cycle optimization.  */
7269       if (!ok)
7270         {
7271           if (lane_reduc_code_p)
7272             return false;
7273           else
7274             single_defuse_cycle = false;
7275         }
7276     }
7277   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7278
7279   /* If the reduction stmt is one of the patterns that have lane
7280      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7281   if ((ncopies > 1 && ! single_defuse_cycle)
7282       && lane_reduc_code_p)
7283     {
7284       if (dump_enabled_p ())
7285         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7286                          "multi def-use cycle not possible for lane-reducing "
7287                          "reduction operation\n");
7288       return false;
7289     }
7290
7291   if (slp_node
7292       && !(!single_defuse_cycle
7293            && code != DOT_PROD_EXPR
7294            && code != WIDEN_SUM_EXPR
7295            && code != SAD_EXPR
7296            && reduction_type != FOLD_LEFT_REDUCTION))
7297     for (i = 0; i < op_type; i++)
7298       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7299         {
7300           if (dump_enabled_p ())
7301             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7302                              "incompatible vector types for invariants\n");
7303           return false;
7304         }
7305
7306   if (slp_node)
7307     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7308   else
7309     vec_num = 1;
7310
7311   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7312                              reduction_type, ncopies, cost_vec);
7313   /* Cost the reduction op inside the loop if transformed via
7314      vect_transform_reduction.  Otherwise this is costed by the
7315      separate vectorizable_* routines.  */
7316   if (single_defuse_cycle
7317       || code == DOT_PROD_EXPR
7318       || code == WIDEN_SUM_EXPR
7319       || code == SAD_EXPR)
7320     record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7321
7322   if (dump_enabled_p ()
7323       && reduction_type == FOLD_LEFT_REDUCTION)
7324     dump_printf_loc (MSG_NOTE, vect_location,
7325                      "using an in-order (fold-left) reduction.\n");
7326   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7327   /* All but single defuse-cycle optimized, lane-reducing and fold-left
7328      reductions go through their own vectorizable_* routines.  */
7329   if (!single_defuse_cycle
7330       && code != DOT_PROD_EXPR
7331       && code != WIDEN_SUM_EXPR
7332       && code != SAD_EXPR
7333       && reduction_type != FOLD_LEFT_REDUCTION)
7334     {
7335       stmt_vec_info tem
7336         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7337       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7338         {
7339           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7340           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7341         }
7342       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7343       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7344     }
7345   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7346     {
7347       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7348       internal_fn cond_fn = get_conditional_internal_fn (code);
7349
7350       if (reduction_type != FOLD_LEFT_REDUCTION
7351           && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7352           && (cond_fn == IFN_LAST
7353               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7354                                                   OPTIMIZE_FOR_SPEED)))
7355         {
7356           if (dump_enabled_p ())
7357             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7358                              "can't operate on partial vectors because"
7359                              " no conditional operation is available.\n");
7360           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7361         }
7362       else if (reduction_type == FOLD_LEFT_REDUCTION
7363                && reduc_fn == IFN_LAST
7364                && !expand_vec_cond_expr_p (vectype_in,
7365                                            truth_type_for (vectype_in),
7366                                            SSA_NAME))
7367         {
7368           if (dump_enabled_p ())
7369             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7370                              "can't operate on partial vectors because"
7371                              " no conditional operation is available.\n");
7372           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7373         }
7374       else
7375         vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7376                                vectype_in, NULL);
7377     }
7378   return true;
7379 }
7380
7381 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7382    value.  */
7383
7384 bool
7385 vect_transform_reduction (loop_vec_info loop_vinfo,
7386                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7387                           gimple **vec_stmt, slp_tree slp_node)
7388 {
7389   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7390   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7391   int i;
7392   int ncopies;
7393   int vec_num;
7394
7395   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7396   gcc_assert (reduc_info->is_reduc_info);
7397
7398   if (nested_in_vect_loop_p (loop, stmt_info))
7399     {
7400       loop = loop->inner;
7401       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7402     }
7403
7404   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7405   enum tree_code code = gimple_assign_rhs_code (stmt);
7406   int op_type = TREE_CODE_LENGTH (code);
7407
7408   /* Flatten RHS.  */
7409   tree ops[3];
7410   switch (get_gimple_rhs_class (code))
7411     {
7412     case GIMPLE_TERNARY_RHS:
7413       ops[2] = gimple_assign_rhs3 (stmt);
7414       /* Fall thru.  */
7415     case GIMPLE_BINARY_RHS:
7416       ops[0] = gimple_assign_rhs1 (stmt);
7417       ops[1] = gimple_assign_rhs2 (stmt);
7418       break;
7419     default:
7420       gcc_unreachable ();
7421     }
7422
7423   /* All uses but the last are expected to be defined in the loop.
7424      The last use is the reduction variable.  In case of nested cycle this
7425      assumption is not true: we use reduc_index to record the index of the
7426      reduction variable.  */
7427   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7428   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7429   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7430   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7431
7432   if (slp_node)
7433     {
7434       ncopies = 1;
7435       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7436     }
7437   else
7438     {
7439       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7440       vec_num = 1;
7441     }
7442
7443   internal_fn cond_fn = get_conditional_internal_fn (code);
7444   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7445   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7446
7447   /* Transform.  */
7448   tree new_temp = NULL_TREE;
7449   auto_vec<tree> vec_oprnds0;
7450   auto_vec<tree> vec_oprnds1;
7451   auto_vec<tree> vec_oprnds2;
7452   tree def0;
7453
7454   if (dump_enabled_p ())
7455     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7456
7457   /* FORNOW: Multiple types are not supported for condition.  */
7458   if (code == COND_EXPR)
7459     gcc_assert (ncopies == 1);
7460
7461   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7462
7463   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7464   if (reduction_type == FOLD_LEFT_REDUCTION)
7465     {
7466       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7467       return vectorize_fold_left_reduction
7468           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7469            reduc_fn, ops, vectype_in, reduc_index, masks);
7470     }
7471
7472   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7473   gcc_assert (single_defuse_cycle
7474               || code == DOT_PROD_EXPR
7475               || code == WIDEN_SUM_EXPR
7476               || code == SAD_EXPR);
7477
7478   /* Create the destination vector  */
7479   tree scalar_dest = gimple_assign_lhs (stmt);
7480   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7481
7482   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7483                      single_defuse_cycle && reduc_index == 0
7484                      ? NULL_TREE : ops[0], &vec_oprnds0,
7485                      single_defuse_cycle && reduc_index == 1
7486                      ? NULL_TREE : ops[1], &vec_oprnds1,
7487                      op_type == ternary_op
7488                      && !(single_defuse_cycle && reduc_index == 2)
7489                      ? ops[2] : NULL_TREE, &vec_oprnds2);
7490   if (single_defuse_cycle)
7491     {
7492       gcc_assert (!slp_node);
7493       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7494                                      ops[reduc_index],
7495                                      reduc_index == 0 ? &vec_oprnds0
7496                                      : (reduc_index == 1 ? &vec_oprnds1
7497                                         : &vec_oprnds2));
7498     }
7499
7500   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7501     {
7502       gimple *new_stmt;
7503       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7504       if (masked_loop_p && !mask_by_cond_expr)
7505         {
7506           /* Make sure that the reduction accumulator is vop[0].  */
7507           if (reduc_index == 1)
7508             {
7509               gcc_assert (commutative_tree_code (code));
7510               std::swap (vop[0], vop[1]);
7511             }
7512           tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7513                                           vectype_in, i);
7514           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7515                                                     vop[0], vop[1], vop[0]);
7516           new_temp = make_ssa_name (vec_dest, call);
7517           gimple_call_set_lhs (call, new_temp);
7518           gimple_call_set_nothrow (call, true);
7519           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7520           new_stmt = call;
7521         }
7522       else
7523         {
7524           if (op_type == ternary_op)
7525             vop[2] = vec_oprnds2[i];
7526
7527           if (masked_loop_p && mask_by_cond_expr)
7528             {
7529               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7530                                               vectype_in, i);
7531               build_vect_cond_expr (code, vop, mask, gsi);
7532             }
7533
7534           new_stmt = gimple_build_assign (vec_dest, code,
7535                                           vop[0], vop[1], vop[2]);
7536           new_temp = make_ssa_name (vec_dest, new_stmt);
7537           gimple_assign_set_lhs (new_stmt, new_temp);
7538           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7539         }
7540
7541       if (slp_node)
7542         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7543       else if (single_defuse_cycle
7544                && i < ncopies - 1)
7545         {
7546           if (reduc_index == 0)
7547             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7548           else if (reduc_index == 1)
7549             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7550           else if (reduc_index == 2)
7551             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7552         }
7553       else
7554         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7555     }
7556
7557   if (!slp_node)
7558     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7559
7560   return true;
7561 }
7562
7563 /* Transform phase of a cycle PHI.  */
7564
7565 bool
7566 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7567                           stmt_vec_info stmt_info, gimple **vec_stmt,
7568                           slp_tree slp_node, slp_instance slp_node_instance)
7569 {
7570   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7571   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7572   int i;
7573   int ncopies;
7574   int j;
7575   bool nested_cycle = false;
7576   int vec_num;
7577
7578   if (nested_in_vect_loop_p (loop, stmt_info))
7579     {
7580       loop = loop->inner;
7581       nested_cycle = true;
7582     }
7583
7584   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7585   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7586   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7587   gcc_assert (reduc_info->is_reduc_info);
7588
7589   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7590       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7591     /* Leave the scalar phi in place.  */
7592     return true;
7593
7594   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7595   /* For a nested cycle we do not fill the above.  */
7596   if (!vectype_in)
7597     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7598   gcc_assert (vectype_in);
7599
7600   if (slp_node)
7601     {
7602       /* The size vect_schedule_slp_instance computes is off for us.  */
7603       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7604                                       * SLP_TREE_LANES (slp_node), vectype_in);
7605       ncopies = 1;
7606     }
7607   else
7608     {
7609       vec_num = 1;
7610       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7611     }
7612
7613   /* Check whether we should use a single PHI node and accumulate
7614      vectors to one before the backedge.  */
7615   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7616     ncopies = 1;
7617
7618   /* Create the destination vector  */
7619   gphi *phi = as_a <gphi *> (stmt_info->stmt);
7620   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7621                                                vectype_out);
7622
7623   /* Get the loop-entry arguments.  */
7624   tree vec_initial_def = NULL_TREE;
7625   auto_vec<tree> vec_initial_defs;
7626   if (slp_node)
7627     {
7628       vec_initial_defs.reserve (vec_num);
7629       if (nested_cycle)
7630         {
7631           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7632           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7633                              &vec_initial_defs);
7634         }
7635       else
7636         {
7637           gcc_assert (slp_node == slp_node_instance->reduc_phis);
7638           vec<tree> &initial_values = reduc_info->reduc_initial_values;
7639           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7640
7641           unsigned int num_phis = stmts.length ();
7642           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7643             num_phis = 1;
7644           initial_values.reserve (num_phis);
7645           for (unsigned int i = 0; i < num_phis; ++i)
7646             {
7647               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7648               initial_values.quick_push (vect_phi_initial_value (this_phi));
7649             }
7650           if (vec_num == 1)
7651             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7652           if (!initial_values.is_empty ())
7653             {
7654               tree initial_value
7655                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7656               tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7657               tree neutral_op
7658                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7659                                             code, initial_value);
7660               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7661                                               &vec_initial_defs, vec_num,
7662                                               stmts.length (), neutral_op);
7663             }
7664         }
7665     }
7666   else
7667     {
7668       /* Get at the scalar def before the loop, that defines the initial
7669          value of the reduction variable.  */
7670       tree initial_def = vect_phi_initial_value (phi);
7671       reduc_info->reduc_initial_values.safe_push (initial_def);
7672       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7673          and we can't use zero for induc_val, use initial_def.  Similarly
7674          for REDUC_MIN and initial_def larger than the base.  */
7675       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7676         {
7677           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7678           if (TREE_CODE (initial_def) == INTEGER_CST
7679               && !integer_zerop (induc_val)
7680               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7681                    && tree_int_cst_lt (initial_def, induc_val))
7682                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7683                       && tree_int_cst_lt (induc_val, initial_def))))
7684             {
7685               induc_val = initial_def;
7686               /* Communicate we used the initial_def to epilouge
7687                  generation.  */
7688               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7689             }
7690           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7691         }
7692       else if (nested_cycle)
7693         {
7694           /* Do not use an adjustment def as that case is not supported
7695              correctly if ncopies is not one.  */
7696           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7697                                          ncopies, initial_def,
7698                                          &vec_initial_defs);
7699         }
7700       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7701                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7702         /* Fill the initial vector with the initial scalar value.  */
7703         vec_initial_def
7704           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7705                                            initial_def, initial_def);
7706       else
7707         {
7708           if (ncopies == 1)
7709             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7710           if (!reduc_info->reduc_initial_values.is_empty ())
7711             {
7712               initial_def = reduc_info->reduc_initial_values[0];
7713               enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7714               tree neutral_op
7715                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7716                                             code, initial_def);
7717               gcc_assert (neutral_op);
7718               /* Try to simplify the vector initialization by applying an
7719                  adjustment after the reduction has been performed.  */
7720               if (!reduc_info->reused_accumulator
7721                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7722                   && !operand_equal_p (neutral_op, initial_def))
7723                 {
7724                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7725                     = initial_def;
7726                   initial_def = neutral_op;
7727                 }
7728               vec_initial_def
7729                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7730                                                  initial_def, neutral_op);
7731             }
7732         }
7733     }
7734
7735   if (vec_initial_def)
7736     {
7737       vec_initial_defs.create (ncopies);
7738       for (i = 0; i < ncopies; ++i)
7739         vec_initial_defs.quick_push (vec_initial_def);
7740     }
7741
7742   if (auto *accumulator = reduc_info->reused_accumulator)
7743     {
7744       tree def = accumulator->reduc_input;
7745       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7746         {
7747           unsigned int nreduc;
7748           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7749                                             (TREE_TYPE (def)),
7750                                           TYPE_VECTOR_SUBPARTS (vectype_out),
7751                                           &nreduc);
7752           gcc_assert (res);
7753           gimple_seq stmts = NULL;
7754           /* Reduce the single vector to a smaller one.  */
7755           if (nreduc != 1)
7756             {
7757               /* Perform the reduction in the appropriate type.  */
7758               tree rvectype = vectype_out;
7759               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7760                                               TREE_TYPE (TREE_TYPE (def))))
7761                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7762                                               TYPE_VECTOR_SUBPARTS
7763                                                 (vectype_out));
7764               def = vect_create_partial_epilog (def, rvectype,
7765                                                 STMT_VINFO_REDUC_CODE
7766                                                   (reduc_info),
7767                                                 &stmts);
7768             }
7769           /* The epilogue loop might use a different vector mode, like
7770              VNx2DI vs. V2DI.  */
7771           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7772             {
7773               tree reduc_type = build_vector_type_for_mode
7774                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7775               def = gimple_convert (&stmts, reduc_type, def);
7776             }
7777           /* Adjust the input so we pick up the partially reduced value
7778              for the skip edge in vect_create_epilog_for_reduction.  */
7779           accumulator->reduc_input = def;
7780           /* And the reduction could be carried out using a different sign.  */
7781           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7782             def = gimple_convert (&stmts, vectype_out, def);
7783           if (loop_vinfo->main_loop_edge)
7784             {
7785               /* While we'd like to insert on the edge this will split
7786                  blocks and disturb bookkeeping, we also will eventually
7787                  need this on the skip edge.  Rely on sinking to
7788                  fixup optimal placement and insert in the pred.  */
7789               gimple_stmt_iterator gsi
7790                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7791               /* Insert before a cond that eventually skips the
7792                  epilogue.  */
7793               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7794                 gsi_prev (&gsi);
7795               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7796             }
7797           else
7798             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7799                                               stmts);
7800         }
7801       if (loop_vinfo->main_loop_edge)
7802         vec_initial_defs[0]
7803           = vect_get_main_loop_result (loop_vinfo, def,
7804                                        vec_initial_defs[0]);
7805       else
7806         vec_initial_defs.safe_push (def);
7807     }
7808
7809   /* Generate the reduction PHIs upfront.  */
7810   for (i = 0; i < vec_num; i++)
7811     {
7812       tree vec_init_def = vec_initial_defs[i];
7813       for (j = 0; j < ncopies; j++)
7814         {
7815           /* Create the reduction-phi that defines the reduction
7816              operand.  */
7817           gphi *new_phi = create_phi_node (vec_dest, loop->header);
7818
7819           /* Set the loop-entry arg of the reduction-phi.  */
7820           if (j != 0 && nested_cycle)
7821             vec_init_def = vec_initial_defs[j];
7822           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7823                        UNKNOWN_LOCATION);
7824
7825           /* The loop-latch arg is set in epilogue processing.  */
7826
7827           if (slp_node)
7828             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7829           else
7830             {
7831               if (j == 0)
7832                 *vec_stmt = new_phi;
7833               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7834             }
7835         }
7836     }
7837
7838   return true;
7839 }
7840
7841 /* Vectorizes LC PHIs.  */
7842
7843 bool
7844 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7845                      stmt_vec_info stmt_info, gimple **vec_stmt,
7846                      slp_tree slp_node)
7847 {
7848   if (!loop_vinfo
7849       || !is_a <gphi *> (stmt_info->stmt)
7850       || gimple_phi_num_args (stmt_info->stmt) != 1)
7851     return false;
7852
7853   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7854       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7855     return false;
7856
7857   if (!vec_stmt) /* transformation not required.  */
7858     {
7859       /* Deal with copies from externs or constants that disguise as
7860          loop-closed PHI nodes (PR97886).  */
7861       if (slp_node
7862           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7863                                                 SLP_TREE_VECTYPE (slp_node)))
7864         {
7865           if (dump_enabled_p ())
7866             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7867                              "incompatible vector types for invariants\n");
7868           return false;
7869         }
7870       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7871       return true;
7872     }
7873
7874   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7875   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7876   basic_block bb = gimple_bb (stmt_info->stmt);
7877   edge e = single_pred_edge (bb);
7878   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7879   auto_vec<tree> vec_oprnds;
7880   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7881                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7882                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7883   for (unsigned i = 0; i < vec_oprnds.length (); i++)
7884     {
7885       /* Create the vectorized LC PHI node.  */
7886       gphi *new_phi = create_phi_node (vec_dest, bb);
7887       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7888       if (slp_node)
7889         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7890       else
7891         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7892     }
7893   if (!slp_node)
7894     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7895
7896   return true;
7897 }
7898
7899 /* Vectorizes PHIs.  */
7900
7901 bool
7902 vectorizable_phi (vec_info *,
7903                   stmt_vec_info stmt_info, gimple **vec_stmt,
7904                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7905 {
7906   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7907     return false;
7908
7909   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7910     return false;
7911
7912   tree vectype = SLP_TREE_VECTYPE (slp_node);
7913
7914   if (!vec_stmt) /* transformation not required.  */
7915     {
7916       slp_tree child;
7917       unsigned i;
7918       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7919         if (!child)
7920           {
7921             if (dump_enabled_p ())
7922               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7923                                "PHI node with unvectorized backedge def\n");
7924             return false;
7925           }
7926         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7927           {
7928             if (dump_enabled_p ())
7929               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7930                                "incompatible vector types for invariants\n");
7931             return false;
7932           }
7933       /* For single-argument PHIs assume coalescing which means zero cost
7934          for the scalar and the vector PHIs.  This avoids artificially
7935          favoring the vector path (but may pessimize it in some cases).  */
7936       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7937         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7938                           vector_stmt, stmt_info, vectype, 0, vect_body);
7939       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7940       return true;
7941     }
7942
7943   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7944   basic_block bb = gimple_bb (stmt_info->stmt);
7945   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7946   auto_vec<gphi *> new_phis;
7947   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7948     {
7949       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7950
7951       /* Skip not yet vectorized defs.  */
7952       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7953           && SLP_TREE_VEC_STMTS (child).is_empty ())
7954         continue;
7955
7956       auto_vec<tree> vec_oprnds;
7957       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7958       if (!new_phis.exists ())
7959         {
7960           new_phis.create (vec_oprnds.length ());
7961           for (unsigned j = 0; j < vec_oprnds.length (); j++)
7962             {
7963               /* Create the vectorized LC PHI node.  */
7964               new_phis.quick_push (create_phi_node (vec_dest, bb));
7965               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7966             }
7967         }
7968       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7969       for (unsigned j = 0; j < vec_oprnds.length (); j++)
7970         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7971     }
7972   /* We should have at least one already vectorized child.  */
7973   gcc_assert (new_phis.exists ());
7974
7975   return true;
7976 }
7977
7978 /* Return true if VECTYPE represents a vector that requires lowering
7979    by the vector lowering pass.  */
7980
7981 bool
7982 vect_emulated_vector_p (tree vectype)
7983 {
7984   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
7985           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
7986               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
7987 }
7988
7989 /* Return true if we can emulate CODE on an integer mode representation
7990    of a vector.  */
7991
7992 bool
7993 vect_can_vectorize_without_simd_p (tree_code code)
7994 {
7995   switch (code)
7996     {
7997     case PLUS_EXPR:
7998     case MINUS_EXPR:
7999     case NEGATE_EXPR:
8000     case BIT_AND_EXPR:
8001     case BIT_IOR_EXPR:
8002     case BIT_XOR_EXPR:
8003     case BIT_NOT_EXPR:
8004       return true;
8005
8006     default:
8007       return false;
8008     }
8009 }
8010
8011 /* Function vectorizable_induction
8012
8013    Check if STMT_INFO performs an induction computation that can be vectorized.
8014    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
8015    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
8016    Return true if STMT_INFO is vectorizable in this way.  */
8017
8018 bool
8019 vectorizable_induction (loop_vec_info loop_vinfo,
8020                         stmt_vec_info stmt_info,
8021                         gimple **vec_stmt, slp_tree slp_node,
8022                         stmt_vector_for_cost *cost_vec)
8023 {
8024   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8025   unsigned ncopies;
8026   bool nested_in_vect_loop = false;
8027   class loop *iv_loop;
8028   tree vec_def;
8029   edge pe = loop_preheader_edge (loop);
8030   basic_block new_bb;
8031   tree new_vec, vec_init, vec_step, t;
8032   tree new_name;
8033   gimple *new_stmt;
8034   gphi *induction_phi;
8035   tree induc_def, vec_dest;
8036   tree init_expr, step_expr;
8037   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8038   unsigned i;
8039   tree expr;
8040   gimple_stmt_iterator si;
8041
8042   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8043   if (!phi)
8044     return false;
8045
8046   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8047     return false;
8048
8049   /* Make sure it was recognized as induction computation.  */
8050   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8051     return false;
8052
8053   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8054   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8055
8056   if (slp_node)
8057     ncopies = 1;
8058   else
8059     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8060   gcc_assert (ncopies >= 1);
8061
8062   /* FORNOW. These restrictions should be relaxed.  */
8063   if (nested_in_vect_loop_p (loop, stmt_info))
8064     {
8065       imm_use_iterator imm_iter;
8066       use_operand_p use_p;
8067       gimple *exit_phi;
8068       edge latch_e;
8069       tree loop_arg;
8070
8071       if (ncopies > 1)
8072         {
8073           if (dump_enabled_p ())
8074             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8075                              "multiple types in nested loop.\n");
8076           return false;
8077         }
8078
8079       exit_phi = NULL;
8080       latch_e = loop_latch_edge (loop->inner);
8081       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8082       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8083         {
8084           gimple *use_stmt = USE_STMT (use_p);
8085           if (is_gimple_debug (use_stmt))
8086             continue;
8087
8088           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8089             {
8090               exit_phi = use_stmt;
8091               break;
8092             }
8093         }
8094       if (exit_phi)
8095         {
8096           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8097           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8098                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8099             {
8100               if (dump_enabled_p ())
8101                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8102                                  "inner-loop induction only used outside "
8103                                  "of the outer vectorized loop.\n");
8104               return false;
8105             }
8106         }
8107
8108       nested_in_vect_loop = true;
8109       iv_loop = loop->inner;
8110     }
8111   else
8112     iv_loop = loop;
8113   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8114
8115   if (slp_node && !nunits.is_constant ())
8116     {
8117       /* The current SLP code creates the step value element-by-element.  */
8118       if (dump_enabled_p ())
8119         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8120                          "SLP induction not supported for variable-length"
8121                          " vectors.\n");
8122       return false;
8123     }
8124
8125   if (!vec_stmt) /* transformation not required.  */
8126     {
8127       unsigned inside_cost = 0, prologue_cost = 0;
8128       if (slp_node)
8129         {
8130           /* We eventually need to set a vector type on invariant
8131              arguments.  */
8132           unsigned j;
8133           slp_tree child;
8134           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8135             if (!vect_maybe_update_slp_op_vectype
8136                 (child, SLP_TREE_VECTYPE (slp_node)))
8137               {
8138                 if (dump_enabled_p ())
8139                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8140                                    "incompatible vector types for "
8141                                    "invariants\n");
8142                 return false;
8143               }
8144           /* loop cost for vec_loop.  */
8145           inside_cost
8146             = record_stmt_cost (cost_vec,
8147                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8148                                 vector_stmt, stmt_info, 0, vect_body);
8149           /* prologue cost for vec_init (if not nested) and step.  */
8150           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8151                                             scalar_to_vec,
8152                                             stmt_info, 0, vect_prologue);
8153         }
8154       else /* if (!slp_node) */
8155         {
8156           /* loop cost for vec_loop.  */
8157           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8158                                           stmt_info, 0, vect_body);
8159           /* prologue cost for vec_init and vec_step.  */
8160           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8161                                             stmt_info, 0, vect_prologue);
8162         }
8163       if (dump_enabled_p ())
8164         dump_printf_loc (MSG_NOTE, vect_location,
8165                          "vect_model_induction_cost: inside_cost = %d, "
8166                          "prologue_cost = %d .\n", inside_cost,
8167                          prologue_cost);
8168
8169       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8170       DUMP_VECT_SCOPE ("vectorizable_induction");
8171       return true;
8172     }
8173
8174   /* Transform.  */
8175
8176   /* Compute a vector variable, initialized with the first VF values of
8177      the induction variable.  E.g., for an iv with IV_PHI='X' and
8178      evolution S, for a vector of 4 units, we want to compute:
8179      [X, X + S, X + 2*S, X + 3*S].  */
8180
8181   if (dump_enabled_p ())
8182     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8183
8184   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8185   gcc_assert (step_expr != NULL_TREE);
8186   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8187
8188   pe = loop_preheader_edge (iv_loop);
8189   /* Find the first insertion point in the BB.  */
8190   basic_block bb = gimple_bb (phi);
8191   si = gsi_after_labels (bb);
8192
8193   /* For SLP induction we have to generate several IVs as for example
8194      with group size 3 we need
8195        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8196        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
8197   if (slp_node)
8198     {
8199       /* Enforced above.  */
8200       unsigned int const_nunits = nunits.to_constant ();
8201
8202       /* The initial values are vectorized, but any lanes > group_size
8203          need adjustment.  */
8204       slp_tree init_node
8205         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8206
8207       /* Gather steps.  Since we do not vectorize inductions as
8208          cycles we have to reconstruct the step from SCEV data.  */
8209       unsigned group_size = SLP_TREE_LANES (slp_node);
8210       tree *steps = XALLOCAVEC (tree, group_size);
8211       tree *inits = XALLOCAVEC (tree, group_size);
8212       stmt_vec_info phi_info;
8213       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8214         {
8215           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8216           if (!init_node)
8217             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8218                                            pe->dest_idx);
8219         }
8220
8221       /* Now generate the IVs.  */
8222       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8223       gcc_assert ((const_nunits * nvects) % group_size == 0);
8224       unsigned nivs;
8225       if (nested_in_vect_loop)
8226         nivs = nvects;
8227       else
8228         {
8229           /* Compute the number of distinct IVs we need.  First reduce
8230              group_size if it is a multiple of const_nunits so we get
8231              one IV for a group_size of 4 but const_nunits 2.  */
8232           unsigned group_sizep = group_size;
8233           if (group_sizep % const_nunits == 0)
8234             group_sizep = group_sizep / const_nunits;
8235           nivs = least_common_multiple (group_sizep,
8236                                         const_nunits) / const_nunits;
8237         }
8238       tree stept = TREE_TYPE (step_vectype);
8239       tree lupdate_mul = NULL_TREE;
8240       if (!nested_in_vect_loop)
8241         {
8242           /* The number of iterations covered in one vector iteration.  */
8243           unsigned lup_mul = (nvects * const_nunits) / group_size;
8244           lupdate_mul
8245             = build_vector_from_val (step_vectype,
8246                                      SCALAR_FLOAT_TYPE_P (stept)
8247                                      ? build_real_from_wide (stept, lup_mul,
8248                                                              UNSIGNED)
8249                                      : build_int_cstu (stept, lup_mul));
8250         }
8251       tree peel_mul = NULL_TREE;
8252       gimple_seq init_stmts = NULL;
8253       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8254         {
8255           if (SCALAR_FLOAT_TYPE_P (stept))
8256             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8257                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8258           else
8259             peel_mul = gimple_convert (&init_stmts, stept,
8260                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8261           peel_mul = gimple_build_vector_from_val (&init_stmts,
8262                                                    step_vectype, peel_mul);
8263         }
8264       unsigned ivn;
8265       auto_vec<tree> vec_steps;
8266       for (ivn = 0; ivn < nivs; ++ivn)
8267         {
8268           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8269           tree_vector_builder init_elts (vectype, const_nunits, 1);
8270           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8271           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8272             {
8273               /* The scalar steps of the IVs.  */
8274               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8275               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8276               step_elts.quick_push (elt);
8277               if (!init_node)
8278                 {
8279                   /* The scalar inits of the IVs if not vectorized.  */
8280                   elt = inits[(ivn*const_nunits + eltn) % group_size];
8281                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
8282                                                   TREE_TYPE (elt)))
8283                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8284                                         TREE_TYPE (vectype), elt);
8285                   init_elts.quick_push (elt);
8286                 }
8287               /* The number of steps to add to the initial values.  */
8288               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8289               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8290                                    ? build_real_from_wide (stept,
8291                                                            mul_elt, UNSIGNED)
8292                                    : build_int_cstu (stept, mul_elt));
8293             }
8294           vec_step = gimple_build_vector (&init_stmts, &step_elts);
8295           vec_steps.safe_push (vec_step);
8296           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8297           if (peel_mul)
8298             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8299                                      step_mul, peel_mul);
8300           if (!init_node)
8301             vec_init = gimple_build_vector (&init_stmts, &init_elts);
8302
8303           /* Create the induction-phi that defines the induction-operand.  */
8304           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8305                                             "vec_iv_");
8306           induction_phi = create_phi_node (vec_dest, iv_loop->header);
8307           induc_def = PHI_RESULT (induction_phi);
8308
8309           /* Create the iv update inside the loop  */
8310           tree up = vec_step;
8311           if (lupdate_mul)
8312             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8313                                vec_step, lupdate_mul);
8314           gimple_seq stmts = NULL;
8315           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8316           vec_def = gimple_build (&stmts,
8317                                   PLUS_EXPR, step_vectype, vec_def, up);
8318           vec_def = gimple_convert (&stmts, vectype, vec_def);
8319           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8320           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8321                        UNKNOWN_LOCATION);
8322
8323           if (init_node)
8324             vec_init = vect_get_slp_vect_def (init_node, ivn);
8325           if (!nested_in_vect_loop
8326               && !integer_zerop (step_mul))
8327             {
8328               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8329               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8330                                  vec_step, step_mul);
8331               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8332                                       vec_def, up);
8333               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8334             }
8335
8336           /* Set the arguments of the phi node:  */
8337           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8338
8339           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8340         }
8341       if (!nested_in_vect_loop)
8342         {
8343           /* Fill up to the number of vectors we need for the whole group.  */
8344           nivs = least_common_multiple (group_size,
8345                                         const_nunits) / const_nunits;
8346           vec_steps.reserve (nivs-ivn);
8347           for (; ivn < nivs; ++ivn)
8348             {
8349               SLP_TREE_VEC_STMTS (slp_node)
8350                 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8351               vec_steps.quick_push (vec_steps[0]);
8352             }
8353         }
8354
8355       /* Re-use IVs when we can.  We are generating further vector
8356          stmts by adding VF' * stride to the IVs generated above.  */
8357       if (ivn < nvects)
8358         {
8359           unsigned vfp
8360             = least_common_multiple (group_size, const_nunits) / group_size;
8361           tree lupdate_mul
8362             = build_vector_from_val (step_vectype,
8363                                      SCALAR_FLOAT_TYPE_P (stept)
8364                                      ? build_real_from_wide (stept,
8365                                                              vfp, UNSIGNED)
8366                                      : build_int_cstu (stept, vfp));
8367           for (; ivn < nvects; ++ivn)
8368             {
8369               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8370               tree def = gimple_get_lhs (iv);
8371               if (ivn < 2*nivs)
8372                 vec_steps[ivn - nivs]
8373                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8374                                   vec_steps[ivn - nivs], lupdate_mul);
8375               gimple_seq stmts = NULL;
8376               def = gimple_convert (&stmts, step_vectype, def);
8377               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8378                                   def, vec_steps[ivn % nivs]);
8379               def = gimple_convert (&stmts, vectype, def);
8380               if (gimple_code (iv) == GIMPLE_PHI)
8381                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8382               else
8383                 {
8384                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8385                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8386                 }
8387               SLP_TREE_VEC_STMTS (slp_node)
8388                 .quick_push (SSA_NAME_DEF_STMT (def));
8389             }
8390         }
8391
8392       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8393       gcc_assert (!new_bb);
8394
8395       return true;
8396     }
8397
8398   init_expr = vect_phi_initial_value (phi);
8399
8400   gimple_seq stmts = NULL;
8401   if (!nested_in_vect_loop)
8402     {
8403       /* Convert the initial value to the IV update type.  */
8404       tree new_type = TREE_TYPE (step_expr);
8405       init_expr = gimple_convert (&stmts, new_type, init_expr);
8406
8407       /* If we are using the loop mask to "peel" for alignment then we need
8408          to adjust the start value here.  */
8409       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8410       if (skip_niters != NULL_TREE)
8411         {
8412           if (FLOAT_TYPE_P (vectype))
8413             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8414                                         skip_niters);
8415           else
8416             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8417           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8418                                          skip_niters, step_expr);
8419           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8420                                     init_expr, skip_step);
8421         }
8422     }
8423
8424   if (stmts)
8425     {
8426       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8427       gcc_assert (!new_bb);
8428     }
8429
8430   /* Create the vector that holds the initial_value of the induction.  */
8431   if (nested_in_vect_loop)
8432     {
8433       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
8434          been created during vectorization of previous stmts.  We obtain it
8435          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
8436       auto_vec<tree> vec_inits;
8437       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8438                                      init_expr, &vec_inits);
8439       vec_init = vec_inits[0];
8440       /* If the initial value is not of proper type, convert it.  */
8441       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8442         {
8443           new_stmt
8444             = gimple_build_assign (vect_get_new_ssa_name (vectype,
8445                                                           vect_simple_var,
8446                                                           "vec_iv_"),
8447                                    VIEW_CONVERT_EXPR,
8448                                    build1 (VIEW_CONVERT_EXPR, vectype,
8449                                            vec_init));
8450           vec_init = gimple_assign_lhs (new_stmt);
8451           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8452                                                  new_stmt);
8453           gcc_assert (!new_bb);
8454         }
8455     }
8456   else
8457     {
8458       /* iv_loop is the loop to be vectorized. Create:
8459          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
8460       stmts = NULL;
8461       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8462
8463       unsigned HOST_WIDE_INT const_nunits;
8464       if (nunits.is_constant (&const_nunits))
8465         {
8466           tree_vector_builder elts (step_vectype, const_nunits, 1);
8467           elts.quick_push (new_name);
8468           for (i = 1; i < const_nunits; i++)
8469             {
8470               /* Create: new_name_i = new_name + step_expr  */
8471               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8472                                        new_name, step_expr);
8473               elts.quick_push (new_name);
8474             }
8475           /* Create a vector from [new_name_0, new_name_1, ...,
8476              new_name_nunits-1]  */
8477           vec_init = gimple_build_vector (&stmts, &elts);
8478         }
8479       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8480         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
8481         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8482                                  new_name, step_expr);
8483       else
8484         {
8485           /* Build:
8486                 [base, base, base, ...]
8487                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
8488           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8489           gcc_assert (flag_associative_math);
8490           tree index = build_index_vector (step_vectype, 0, 1);
8491           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8492                                                         new_name);
8493           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8494                                                         step_expr);
8495           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8496           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8497                                    vec_init, step_vec);
8498           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8499                                    vec_init, base_vec);
8500         }
8501       vec_init = gimple_convert (&stmts, vectype, vec_init);
8502
8503       if (stmts)
8504         {
8505           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8506           gcc_assert (!new_bb);
8507         }
8508     }
8509
8510
8511   /* Create the vector that holds the step of the induction.  */
8512   if (nested_in_vect_loop)
8513     /* iv_loop is nested in the loop to be vectorized. Generate:
8514        vec_step = [S, S, S, S]  */
8515     new_name = step_expr;
8516   else
8517     {
8518       /* iv_loop is the loop to be vectorized. Generate:
8519           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
8520       gimple_seq seq = NULL;
8521       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8522         {
8523           expr = build_int_cst (integer_type_node, vf);
8524           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8525         }
8526       else
8527         expr = build_int_cst (TREE_TYPE (step_expr), vf);
8528       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8529                                expr, step_expr);
8530       if (seq)
8531         {
8532           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8533           gcc_assert (!new_bb);
8534         }
8535     }
8536
8537   t = unshare_expr (new_name);
8538   gcc_assert (CONSTANT_CLASS_P (new_name)
8539               || TREE_CODE (new_name) == SSA_NAME);
8540   new_vec = build_vector_from_val (step_vectype, t);
8541   vec_step = vect_init_vector (loop_vinfo, stmt_info,
8542                                new_vec, step_vectype, NULL);
8543
8544
8545   /* Create the following def-use cycle:
8546      loop prolog:
8547          vec_init = ...
8548          vec_step = ...
8549      loop:
8550          vec_iv = PHI <vec_init, vec_loop>
8551          ...
8552          STMT
8553          ...
8554          vec_loop = vec_iv + vec_step;  */
8555
8556   /* Create the induction-phi that defines the induction-operand.  */
8557   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8558   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8559   induc_def = PHI_RESULT (induction_phi);
8560
8561   /* Create the iv update inside the loop  */
8562   stmts = NULL;
8563   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8564   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8565   vec_def = gimple_convert (&stmts, vectype, vec_def);
8566   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8567   new_stmt = SSA_NAME_DEF_STMT (vec_def);
8568
8569   /* Set the arguments of the phi node:  */
8570   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8571   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8572                UNKNOWN_LOCATION);
8573
8574   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8575   *vec_stmt = induction_phi;
8576
8577   /* In case that vectorization factor (VF) is bigger than the number
8578      of elements that we can fit in a vectype (nunits), we have to generate
8579      more than one vector stmt - i.e - we need to "unroll" the
8580      vector stmt by a factor VF/nunits.  For more details see documentation
8581      in vectorizable_operation.  */
8582
8583   if (ncopies > 1)
8584     {
8585       gimple_seq seq = NULL;
8586       /* FORNOW. This restriction should be relaxed.  */
8587       gcc_assert (!nested_in_vect_loop);
8588
8589       /* Create the vector that holds the step of the induction.  */
8590       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8591         {
8592           expr = build_int_cst (integer_type_node, nunits);
8593           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8594         }
8595       else
8596         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8597       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8598                                expr, step_expr);
8599       if (seq)
8600         {
8601           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8602           gcc_assert (!new_bb);
8603         }
8604
8605       t = unshare_expr (new_name);
8606       gcc_assert (CONSTANT_CLASS_P (new_name)
8607                   || TREE_CODE (new_name) == SSA_NAME);
8608       new_vec = build_vector_from_val (step_vectype, t);
8609       vec_step = vect_init_vector (loop_vinfo, stmt_info,
8610                                    new_vec, step_vectype, NULL);
8611
8612       vec_def = induc_def;
8613       for (i = 1; i < ncopies; i++)
8614         {
8615           /* vec_i = vec_prev + vec_step  */
8616           gimple_seq stmts = NULL;
8617           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8618           vec_def = gimple_build (&stmts,
8619                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
8620           vec_def = gimple_convert (&stmts, vectype, vec_def);
8621
8622           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8623           new_stmt = SSA_NAME_DEF_STMT (vec_def);
8624           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8625         }
8626     }
8627
8628   if (dump_enabled_p ())
8629     dump_printf_loc (MSG_NOTE, vect_location,
8630                      "transform induction: created def-use cycle: %G%G",
8631                      induction_phi, SSA_NAME_DEF_STMT (vec_def));
8632
8633   return true;
8634 }
8635
8636 /* Function vectorizable_live_operation.
8637
8638    STMT_INFO computes a value that is used outside the loop.  Check if
8639    it can be supported.  */
8640
8641 bool
8642 vectorizable_live_operation (vec_info *vinfo,
8643                              stmt_vec_info stmt_info,
8644                              gimple_stmt_iterator *gsi,
8645                              slp_tree slp_node, slp_instance slp_node_instance,
8646                              int slp_index, bool vec_stmt_p,
8647                              stmt_vector_for_cost *cost_vec)
8648 {
8649   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8650   imm_use_iterator imm_iter;
8651   tree lhs, lhs_type, bitsize;
8652   tree vectype = (slp_node
8653                   ? SLP_TREE_VECTYPE (slp_node)
8654                   : STMT_VINFO_VECTYPE (stmt_info));
8655   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8656   int ncopies;
8657   gimple *use_stmt;
8658   auto_vec<tree> vec_oprnds;
8659   int vec_entry = 0;
8660   poly_uint64 vec_index = 0;
8661
8662   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8663
8664   /* If a stmt of a reduction is live, vectorize it via
8665      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
8666      validity so just trigger the transform here.  */
8667   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8668     {
8669       if (!vec_stmt_p)
8670         return true;
8671       if (slp_node)
8672         {
8673           /* For reduction chains the meta-info is attached to
8674              the group leader.  */
8675           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8676             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8677           /* For SLP reductions we vectorize the epilogue for
8678              all involved stmts together.  */
8679           else if (slp_index != 0)
8680             return true;
8681           else
8682             /* For SLP reductions the meta-info is attached to
8683                the representative.  */
8684             stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8685         }
8686       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8687       gcc_assert (reduc_info->is_reduc_info);
8688       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8689           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8690         return true;
8691       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8692                                         slp_node_instance);
8693       return true;
8694     }
8695
8696   /* If STMT is not relevant and it is a simple assignment and its inputs are
8697      invariant then it can remain in place, unvectorized.  The original last
8698      scalar value that it computes will be used.  */
8699   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8700     {
8701       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8702       if (dump_enabled_p ())
8703         dump_printf_loc (MSG_NOTE, vect_location,
8704                          "statement is simple and uses invariant.  Leaving in "
8705                          "place.\n");
8706       return true;
8707     }
8708
8709   if (slp_node)
8710     ncopies = 1;
8711   else
8712     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8713
8714   if (slp_node)
8715     {
8716       gcc_assert (slp_index >= 0);
8717
8718       /* Get the last occurrence of the scalar index from the concatenation of
8719          all the slp vectors. Calculate which slp vector it is and the index
8720          within.  */
8721       int num_scalar = SLP_TREE_LANES (slp_node);
8722       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8723       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8724
8725       /* Calculate which vector contains the result, and which lane of
8726          that vector we need.  */
8727       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8728         {
8729           if (dump_enabled_p ())
8730             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8731                              "Cannot determine which vector holds the"
8732                              " final result.\n");
8733           return false;
8734         }
8735     }
8736
8737   if (!vec_stmt_p)
8738     {
8739       /* No transformation required.  */
8740       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8741         {
8742           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8743                                                OPTIMIZE_FOR_SPEED))
8744             {
8745               if (dump_enabled_p ())
8746                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8747                                  "can't operate on partial vectors "
8748                                  "because the target doesn't support extract "
8749                                  "last reduction.\n");
8750               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8751             }
8752           else if (slp_node)
8753             {
8754               if (dump_enabled_p ())
8755                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8756                                  "can't operate on partial vectors "
8757                                  "because an SLP statement is live after "
8758                                  "the loop.\n");
8759               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8760             }
8761           else if (ncopies > 1)
8762             {
8763               if (dump_enabled_p ())
8764                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8765                                  "can't operate on partial vectors "
8766                                  "because ncopies is greater than 1.\n");
8767               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8768             }
8769           else
8770             {
8771               gcc_assert (ncopies == 1 && !slp_node);
8772               vect_record_loop_mask (loop_vinfo,
8773                                      &LOOP_VINFO_MASKS (loop_vinfo),
8774                                      1, vectype, NULL);
8775             }
8776         }
8777       /* ???  Enable for loop costing as well.  */
8778       if (!loop_vinfo)
8779         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8780                           0, vect_epilogue);
8781       return true;
8782     }
8783
8784   /* Use the lhs of the original scalar statement.  */
8785   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8786   if (dump_enabled_p ())
8787     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8788                      "stmt %G", stmt);
8789
8790   lhs = gimple_get_lhs (stmt);
8791   lhs_type = TREE_TYPE (lhs);
8792
8793   bitsize = vector_element_bits_tree (vectype);
8794
8795   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8796   tree vec_lhs, bitstart;
8797   gimple *vec_stmt;
8798   if (slp_node)
8799     {
8800       gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8801
8802       /* Get the correct slp vectorized stmt.  */
8803       vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8804       vec_lhs = gimple_get_lhs (vec_stmt);
8805
8806       /* Get entry to use.  */
8807       bitstart = bitsize_int (vec_index);
8808       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8809     }
8810   else
8811     {
8812       /* For multiple copies, get the last copy.  */
8813       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8814       vec_lhs = gimple_get_lhs (vec_stmt);
8815
8816       /* Get the last lane in the vector.  */
8817       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8818     }
8819
8820   if (loop_vinfo)
8821     {
8822       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8823          requirement, insert one phi node for it.  It looks like:
8824            loop;
8825          BB:
8826            # lhs' = PHI <lhs>
8827          ==>
8828            loop;
8829          BB:
8830            # vec_lhs' = PHI <vec_lhs>
8831            new_tree = lane_extract <vec_lhs', ...>;
8832            lhs' = new_tree;  */
8833
8834       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8835       basic_block exit_bb = single_exit (loop)->dest;
8836       gcc_assert (single_pred_p (exit_bb));
8837
8838       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8839       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8840       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8841
8842       gimple_seq stmts = NULL;
8843       tree new_tree;
8844       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8845         {
8846           /* Emit:
8847
8848                SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8849
8850              where VEC_LHS is the vectorized live-out result and MASK is
8851              the loop mask for the final iteration.  */
8852           gcc_assert (ncopies == 1 && !slp_node);
8853           tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8854           tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8855                                           1, vectype, 0);
8856           tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8857                                           mask, vec_lhs_phi);
8858
8859           /* Convert the extracted vector element to the scalar type.  */
8860           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8861         }
8862       else
8863         {
8864           tree bftype = TREE_TYPE (vectype);
8865           if (VECTOR_BOOLEAN_TYPE_P (vectype))
8866             bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8867           new_tree = build3 (BIT_FIELD_REF, bftype,
8868                              vec_lhs_phi, bitsize, bitstart);
8869           new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8870                                            &stmts, true, NULL_TREE);
8871         }
8872
8873       if (stmts)
8874         {
8875           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8876           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8877
8878           /* Remove existing phi from lhs and create one copy from new_tree.  */
8879           tree lhs_phi = NULL_TREE;
8880           gimple_stmt_iterator gsi;
8881           for (gsi = gsi_start_phis (exit_bb);
8882                !gsi_end_p (gsi); gsi_next (&gsi))
8883             {
8884               gimple *phi = gsi_stmt (gsi);
8885               if ((gimple_phi_arg_def (phi, 0) == lhs))
8886                 {
8887                   remove_phi_node (&gsi, false);
8888                   lhs_phi = gimple_phi_result (phi);
8889                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8890                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8891                   break;
8892                 }
8893             }
8894         }
8895
8896       /* Replace use of lhs with newly computed result.  If the use stmt is a
8897          single arg PHI, just replace all uses of PHI result.  It's necessary
8898          because lcssa PHI defining lhs may be before newly inserted stmt.  */
8899       use_operand_p use_p;
8900       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8901         if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8902             && !is_gimple_debug (use_stmt))
8903           {
8904             if (gimple_code (use_stmt) == GIMPLE_PHI
8905                 && gimple_phi_num_args (use_stmt) == 1)
8906               {
8907                 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8908               }
8909             else
8910               {
8911                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8912                     SET_USE (use_p, new_tree);
8913               }
8914             update_stmt (use_stmt);
8915           }
8916     }
8917   else
8918     {
8919       /* For basic-block vectorization simply insert the lane-extraction.  */
8920       tree bftype = TREE_TYPE (vectype);
8921       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8922         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8923       tree new_tree = build3 (BIT_FIELD_REF, bftype,
8924                               vec_lhs, bitsize, bitstart);
8925       gimple_seq stmts = NULL;
8926       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8927                                        &stmts, true, NULL_TREE);
8928       if (TREE_CODE (new_tree) == SSA_NAME
8929           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8930         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8931       if (is_a <gphi *> (vec_stmt))
8932         {
8933           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8934           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8935         }
8936       else
8937         {
8938           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8939           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8940         }
8941
8942       /* Replace use of lhs with newly computed result.  If the use stmt is a
8943          single arg PHI, just replace all uses of PHI result.  It's necessary
8944          because lcssa PHI defining lhs may be before newly inserted stmt.  */
8945       use_operand_p use_p;
8946       stmt_vec_info use_stmt_info;
8947       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8948         if (!is_gimple_debug (use_stmt)
8949             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8950                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8951           {
8952             /* ???  This can happen when the live lane ends up being
8953                used in a vector construction code-generated by an
8954                external SLP node (and code-generation for that already
8955                happened).  See gcc.dg/vect/bb-slp-47.c.
8956                Doing this is what would happen if that vector CTOR
8957                were not code-generated yet so it is not too bad.
8958                ???  In fact we'd likely want to avoid this situation
8959                in the first place.  */
8960             if (TREE_CODE (new_tree) == SSA_NAME
8961                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8962                 && gimple_code (use_stmt) != GIMPLE_PHI
8963                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8964                                                 use_stmt))
8965               {
8966                 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8967                 gcc_assert (code == CONSTRUCTOR
8968                             || code == VIEW_CONVERT_EXPR
8969                             || CONVERT_EXPR_CODE_P (code));
8970                 if (dump_enabled_p ())
8971                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8972                                    "Using original scalar computation for "
8973                                    "live lane because use preceeds vector "
8974                                    "def\n");
8975                 continue;
8976               }
8977             /* ???  It can also happen that we end up pulling a def into
8978                a loop where replacing out-of-loop uses would require
8979                a new LC SSA PHI node.  Retain the original scalar in
8980                those cases as well.  PR98064.  */
8981             if (TREE_CODE (new_tree) == SSA_NAME
8982                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8983                 && (gimple_bb (use_stmt)->loop_father
8984                     != gimple_bb (vec_stmt)->loop_father)
8985                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8986                                         gimple_bb (use_stmt)->loop_father))
8987               {
8988                 if (dump_enabled_p ())
8989                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8990                                    "Using original scalar computation for "
8991                                    "live lane because there is an out-of-loop "
8992                                    "definition for it\n");
8993                 continue;
8994               }
8995             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8996               SET_USE (use_p, new_tree);
8997             update_stmt (use_stmt);
8998           }
8999     }
9000
9001   return true;
9002 }
9003
9004 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
9005
9006 static void
9007 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
9008 {
9009   ssa_op_iter op_iter;
9010   imm_use_iterator imm_iter;
9011   def_operand_p def_p;
9012   gimple *ustmt;
9013
9014   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
9015     {
9016       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
9017         {
9018           basic_block bb;
9019
9020           if (!is_gimple_debug (ustmt))
9021             continue;
9022
9023           bb = gimple_bb (ustmt);
9024
9025           if (!flow_bb_inside_loop_p (loop, bb))
9026             {
9027               if (gimple_debug_bind_p (ustmt))
9028                 {
9029                   if (dump_enabled_p ())
9030                     dump_printf_loc (MSG_NOTE, vect_location,
9031                                      "killing debug use\n");
9032
9033                   gimple_debug_bind_reset_value (ustmt);
9034                   update_stmt (ustmt);
9035                 }
9036               else
9037                 gcc_unreachable ();
9038             }
9039         }
9040     }
9041 }
9042
9043 /* Given loop represented by LOOP_VINFO, return true if computation of
9044    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9045    otherwise.  */
9046
9047 static bool
9048 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9049 {
9050   /* Constant case.  */
9051   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9052     {
9053       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9054       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9055
9056       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9057       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9058       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9059         return true;
9060     }
9061
9062   widest_int max;
9063   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9064   /* Check the upper bound of loop niters.  */
9065   if (get_max_loop_iterations (loop, &max))
9066     {
9067       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9068       signop sgn = TYPE_SIGN (type);
9069       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9070       if (max < type_max)
9071         return true;
9072     }
9073   return false;
9074 }
9075
9076 /* Return a mask type with half the number of elements as OLD_TYPE,
9077    given that it should have mode NEW_MODE.  */
9078
9079 tree
9080 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9081 {
9082   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9083   return build_truth_vector_type_for_mode (nunits, new_mode);
9084 }
9085
9086 /* Return a mask type with twice as many elements as OLD_TYPE,
9087    given that it should have mode NEW_MODE.  */
9088
9089 tree
9090 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9091 {
9092   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9093   return build_truth_vector_type_for_mode (nunits, new_mode);
9094 }
9095
9096 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9097    contain a sequence of NVECTORS masks that each control a vector of type
9098    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
9099    these vector masks with the vector version of SCALAR_MASK.  */
9100
9101 void
9102 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9103                        unsigned int nvectors, tree vectype, tree scalar_mask)
9104 {
9105   gcc_assert (nvectors != 0);
9106   if (masks->length () < nvectors)
9107     masks->safe_grow_cleared (nvectors, true);
9108   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9109   /* The number of scalars per iteration and the number of vectors are
9110      both compile-time constants.  */
9111   unsigned int nscalars_per_iter
9112     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9113                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9114
9115   if (scalar_mask)
9116     {
9117       scalar_cond_masked_key cond (scalar_mask, nvectors);
9118       loop_vinfo->scalar_cond_masked_set.add (cond);
9119     }
9120
9121   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9122     {
9123       rgm->max_nscalars_per_iter = nscalars_per_iter;
9124       rgm->type = truth_type_for (vectype);
9125       rgm->factor = 1;
9126     }
9127 }
9128
9129 /* Given a complete set of masks MASKS, extract mask number INDEX
9130    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9131    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
9132
9133    See the comment above vec_loop_masks for more details about the mask
9134    arrangement.  */
9135
9136 tree
9137 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9138                     unsigned int nvectors, tree vectype, unsigned int index)
9139 {
9140   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9141   tree mask_type = rgm->type;
9142
9143   /* Populate the rgroup's mask array, if this is the first time we've
9144      used it.  */
9145   if (rgm->controls.is_empty ())
9146     {
9147       rgm->controls.safe_grow_cleared (nvectors, true);
9148       for (unsigned int i = 0; i < nvectors; ++i)
9149         {
9150           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9151           /* Provide a dummy definition until the real one is available.  */
9152           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9153           rgm->controls[i] = mask;
9154         }
9155     }
9156
9157   tree mask = rgm->controls[index];
9158   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9159                 TYPE_VECTOR_SUBPARTS (vectype)))
9160     {
9161       /* A loop mask for data type X can be reused for data type Y
9162          if X has N times more elements than Y and if Y's elements
9163          are N times bigger than X's.  In this case each sequence
9164          of N elements in the loop mask will be all-zero or all-one.
9165          We can then view-convert the mask so that each sequence of
9166          N elements is replaced by a single element.  */
9167       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9168                               TYPE_VECTOR_SUBPARTS (vectype)));
9169       gimple_seq seq = NULL;
9170       mask_type = truth_type_for (vectype);
9171       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9172       if (seq)
9173         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9174     }
9175   return mask;
9176 }
9177
9178 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9179    lengths for controlling an operation on VECTYPE.  The operation splits
9180    each element of VECTYPE into FACTOR separate subelements, measuring the
9181    length as a number of these subelements.  */
9182
9183 void
9184 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9185                       unsigned int nvectors, tree vectype, unsigned int factor)
9186 {
9187   gcc_assert (nvectors != 0);
9188   if (lens->length () < nvectors)
9189     lens->safe_grow_cleared (nvectors, true);
9190   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9191
9192   /* The number of scalars per iteration, scalar occupied bytes and
9193      the number of vectors are both compile-time constants.  */
9194   unsigned int nscalars_per_iter
9195     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9196                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9197
9198   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9199     {
9200       /* For now, we only support cases in which all loads and stores fall back
9201          to VnQI or none do.  */
9202       gcc_assert (!rgl->max_nscalars_per_iter
9203                   || (rgl->factor == 1 && factor == 1)
9204                   || (rgl->max_nscalars_per_iter * rgl->factor
9205                       == nscalars_per_iter * factor));
9206       rgl->max_nscalars_per_iter = nscalars_per_iter;
9207       rgl->type = vectype;
9208       rgl->factor = factor;
9209     }
9210 }
9211
9212 /* Given a complete set of length LENS, extract length number INDEX for an
9213    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
9214
9215 tree
9216 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9217                    unsigned int nvectors, unsigned int index)
9218 {
9219   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9220
9221   /* Populate the rgroup's len array, if this is the first time we've
9222      used it.  */
9223   if (rgl->controls.is_empty ())
9224     {
9225       rgl->controls.safe_grow_cleared (nvectors, true);
9226       for (unsigned int i = 0; i < nvectors; ++i)
9227         {
9228           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9229           gcc_assert (len_type != NULL_TREE);
9230           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9231
9232           /* Provide a dummy definition until the real one is available.  */
9233           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9234           rgl->controls[i] = len;
9235         }
9236     }
9237
9238   return rgl->controls[index];
9239 }
9240
9241 /* Scale profiling counters by estimation for LOOP which is vectorized
9242    by factor VF.  */
9243
9244 static void
9245 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9246 {
9247   edge preheader = loop_preheader_edge (loop);
9248   /* Reduce loop iterations by the vectorization factor.  */
9249   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9250   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9251
9252   if (freq_h.nonzero_p ())
9253     {
9254       profile_probability p;
9255
9256       /* Avoid dropping loop body profile counter to 0 because of zero count
9257          in loop's preheader.  */
9258       if (!(freq_e == profile_count::zero ()))
9259         freq_e = freq_e.force_nonzero ();
9260       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9261       scale_loop_frequencies (loop, p);
9262     }
9263
9264   edge exit_e = single_exit (loop);
9265   exit_e->probability = profile_probability::always ()
9266                                  .apply_scale (1, new_est_niter + 1);
9267
9268   edge exit_l = single_pred_edge (loop->latch);
9269   profile_probability prob = exit_l->probability;
9270   exit_l->probability = exit_e->probability.invert ();
9271   if (prob.initialized_p () && exit_l->probability.initialized_p ())
9272     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9273 }
9274
9275 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9276    latch edge values originally defined by it.  */
9277
9278 static void
9279 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9280                                      stmt_vec_info def_stmt_info)
9281 {
9282   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9283   if (!def || TREE_CODE (def) != SSA_NAME)
9284     return;
9285   stmt_vec_info phi_info;
9286   imm_use_iterator iter;
9287   use_operand_p use_p;
9288   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9289     if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9290       if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9291           && (phi_info = loop_vinfo->lookup_stmt (phi))
9292           && STMT_VINFO_RELEVANT_P (phi_info)
9293           && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9294           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9295           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9296         {
9297           loop_p loop = gimple_bb (phi)->loop_father;
9298           edge e = loop_latch_edge (loop);
9299           if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9300             {
9301               vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9302               vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9303               gcc_assert (phi_defs.length () == latch_defs.length ());
9304               for (unsigned i = 0; i < phi_defs.length (); ++i)
9305                 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9306                              gimple_get_lhs (latch_defs[i]), e,
9307                              gimple_phi_arg_location (phi, e->dest_idx));
9308             }
9309         }
9310 }
9311
9312 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9313    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9314    stmt_vec_info.  */
9315
9316 static bool
9317 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9318                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9319 {
9320   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9321   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9322
9323   if (dump_enabled_p ())
9324     dump_printf_loc (MSG_NOTE, vect_location,
9325                      "------>vectorizing statement: %G", stmt_info->stmt);
9326
9327   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9328     vect_loop_kill_debug_uses (loop, stmt_info);
9329
9330   if (!STMT_VINFO_RELEVANT_P (stmt_info)
9331       && !STMT_VINFO_LIVE_P (stmt_info))
9332     return false;
9333
9334   if (STMT_VINFO_VECTYPE (stmt_info))
9335     {
9336       poly_uint64 nunits
9337         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9338       if (!STMT_SLP_TYPE (stmt_info)
9339           && maybe_ne (nunits, vf)
9340           && dump_enabled_p ())
9341         /* For SLP VF is set according to unrolling factor, and not
9342            to vector size, hence for SLP this print is not valid.  */
9343         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9344     }
9345
9346   /* Pure SLP statements have already been vectorized.  We still need
9347      to apply loop vectorization to hybrid SLP statements.  */
9348   if (PURE_SLP_STMT (stmt_info))
9349     return false;
9350
9351   if (dump_enabled_p ())
9352     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9353
9354   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9355     *seen_store = stmt_info;
9356
9357   return true;
9358 }
9359
9360 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
9361    in the hash_map with its corresponding values.  */
9362
9363 static tree
9364 find_in_mapping (tree t, void *context)
9365 {
9366   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9367
9368   tree *value = mapping->get (t);
9369   return value ? *value : t;
9370 }
9371
9372 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
9373    original loop that has now been vectorized.
9374
9375    The inits of the data_references need to be advanced with the number of
9376    iterations of the main loop.  This has been computed in vect_do_peeling and
9377    is stored in parameter ADVANCE.  We first restore the data_references
9378    initial offset with the values recored in ORIG_DRS_INIT.
9379
9380    Since the loop_vec_info of this EPILOGUE was constructed for the original
9381    loop, its stmt_vec_infos all point to the original statements.  These need
9382    to be updated to point to their corresponding copies as well as the SSA_NAMES
9383    in their PATTERN_DEF_SEQs and RELATED_STMTs.
9384
9385    The data_reference's connections also need to be updated.  Their
9386    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9387    stmt_vec_infos, their statements need to point to their corresponding copy,
9388    if they are gather loads or scatter stores then their reference needs to be
9389    updated to point to its corresponding copy and finally we set
9390    'base_misaligned' to false as we have already peeled for alignment in the
9391    prologue of the main loop.  */
9392
9393 static void
9394 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9395 {
9396   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9397   auto_vec<gimple *> stmt_worklist;
9398   hash_map<tree,tree> mapping;
9399   gimple *orig_stmt, *new_stmt;
9400   gimple_stmt_iterator epilogue_gsi;
9401   gphi_iterator epilogue_phi_gsi;
9402   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9403   basic_block *epilogue_bbs = get_loop_body (epilogue);
9404   unsigned i;
9405
9406   free (LOOP_VINFO_BBS (epilogue_vinfo));
9407   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9408
9409   /* Advance data_reference's with the number of iterations of the previous
9410      loop and its prologue.  */
9411   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9412
9413
9414   /* The EPILOGUE loop is a copy of the original loop so they share the same
9415      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
9416      point to the copied statements.  We also create a mapping of all LHS' in
9417      the original loop and all the LHS' in the EPILOGUE and create worklists to
9418      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
9419   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9420     {
9421       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9422            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9423         {
9424           new_stmt = epilogue_phi_gsi.phi ();
9425
9426           gcc_assert (gimple_uid (new_stmt) > 0);
9427           stmt_vinfo
9428             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9429
9430           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9431           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9432
9433           mapping.put (gimple_phi_result (orig_stmt),
9434                        gimple_phi_result (new_stmt));
9435           /* PHI nodes can not have patterns or related statements.  */
9436           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9437                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9438         }
9439
9440       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9441            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9442         {
9443           new_stmt = gsi_stmt (epilogue_gsi);
9444           if (is_gimple_debug (new_stmt))
9445             continue;
9446
9447           gcc_assert (gimple_uid (new_stmt) > 0);
9448           stmt_vinfo
9449             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9450
9451           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9452           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9453
9454           if (tree old_lhs = gimple_get_lhs (orig_stmt))
9455             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9456
9457           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9458             {
9459               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9460               for (gimple_stmt_iterator gsi = gsi_start (seq);
9461                    !gsi_end_p (gsi); gsi_next (&gsi))
9462                 stmt_worklist.safe_push (gsi_stmt (gsi));
9463             }
9464
9465           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9466           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9467             {
9468               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9469               stmt_worklist.safe_push (stmt);
9470               /* Set BB such that the assert in
9471                 'get_initial_def_for_reduction' is able to determine that
9472                 the BB of the related stmt is inside this loop.  */
9473               gimple_set_bb (stmt,
9474                              gimple_bb (new_stmt));
9475               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9476               gcc_assert (related_vinfo == NULL
9477                           || related_vinfo == stmt_vinfo);
9478             }
9479         }
9480     }
9481
9482   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9483      using the original main loop and thus need to be updated to refer to the
9484      cloned variables used in the epilogue.  */
9485   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9486     {
9487       gimple *stmt = stmt_worklist[i];
9488       tree *new_op;
9489
9490       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9491         {
9492           tree op = gimple_op (stmt, j);
9493           if ((new_op = mapping.get(op)))
9494             gimple_set_op (stmt, j, *new_op);
9495           else
9496             {
9497               /* PR92429: The last argument of simplify_replace_tree disables
9498                  folding when replacing arguments.  This is required as
9499                  otherwise you might end up with different statements than the
9500                  ones analyzed in vect_loop_analyze, leading to different
9501                  vectorization.  */
9502               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9503                                           &find_in_mapping, &mapping, false);
9504               gimple_set_op (stmt, j, op);
9505             }
9506         }
9507     }
9508
9509   struct data_reference *dr;
9510   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9511   FOR_EACH_VEC_ELT (datarefs, i, dr)
9512     {
9513       orig_stmt = DR_STMT (dr);
9514       gcc_assert (gimple_uid (orig_stmt) > 0);
9515       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9516       /* Data references for gather loads and scatter stores do not use the
9517          updated offset we set using ADVANCE.  Instead we have to make sure the
9518          reference in the data references point to the corresponding copy of
9519          the original in the epilogue.  */
9520       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9521           == VMAT_GATHER_SCATTER)
9522         {
9523           DR_REF (dr)
9524             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9525                                      &find_in_mapping, &mapping);
9526           DR_BASE_ADDRESS (dr)
9527             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9528                                      &find_in_mapping, &mapping);
9529         }
9530       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9531       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9532       /* The vector size of the epilogue is smaller than that of the main loop
9533          so the alignment is either the same or lower. This means the dr will
9534          thus by definition be aligned.  */
9535       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9536     }
9537
9538   epilogue_vinfo->shared->datarefs_copy.release ();
9539   epilogue_vinfo->shared->save_datarefs ();
9540 }
9541
9542 /* Function vect_transform_loop.
9543
9544    The analysis phase has determined that the loop is vectorizable.
9545    Vectorize the loop - created vectorized stmts to replace the scalar
9546    stmts in the loop, and update the loop exit condition.
9547    Returns scalar epilogue loop if any.  */
9548
9549 class loop *
9550 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9551 {
9552   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9553   class loop *epilogue = NULL;
9554   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9555   int nbbs = loop->num_nodes;
9556   int i;
9557   tree niters_vector = NULL_TREE;
9558   tree step_vector = NULL_TREE;
9559   tree niters_vector_mult_vf = NULL_TREE;
9560   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9561   unsigned int lowest_vf = constant_lower_bound (vf);
9562   gimple *stmt;
9563   bool check_profitability = false;
9564   unsigned int th;
9565
9566   DUMP_VECT_SCOPE ("vec_transform_loop");
9567
9568   loop_vinfo->shared->check_datarefs ();
9569
9570   /* Use the more conservative vectorization threshold.  If the number
9571      of iterations is constant assume the cost check has been performed
9572      by our caller.  If the threshold makes all loops profitable that
9573      run at least the (estimated) vectorization factor number of times
9574      checking is pointless, too.  */
9575   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9576   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9577     {
9578       if (dump_enabled_p ())
9579         dump_printf_loc (MSG_NOTE, vect_location,
9580                          "Profitability threshold is %d loop iterations.\n",
9581                          th);
9582       check_profitability = true;
9583     }
9584
9585   /* Make sure there exists a single-predecessor exit bb.  Do this before
9586      versioning.   */
9587   edge e = single_exit (loop);
9588   if (! single_pred_p (e->dest))
9589     {
9590       split_loop_exit_edge (e, true);
9591       if (dump_enabled_p ())
9592         dump_printf (MSG_NOTE, "split exit edge\n");
9593     }
9594
9595   /* Version the loop first, if required, so the profitability check
9596      comes first.  */
9597
9598   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9599     {
9600       class loop *sloop
9601         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9602       sloop->force_vectorize = false;
9603       check_profitability = false;
9604     }
9605
9606   /* Make sure there exists a single-predecessor exit bb also on the
9607      scalar loop copy.  Do this after versioning but before peeling
9608      so CFG structure is fine for both scalar and if-converted loop
9609      to make slpeel_duplicate_current_defs_from_edges face matched
9610      loop closed PHI nodes on the exit.  */
9611   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9612     {
9613       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9614       if (! single_pred_p (e->dest))
9615         {
9616           split_loop_exit_edge (e, true);
9617           if (dump_enabled_p ())
9618             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9619         }
9620     }
9621
9622   tree niters = vect_build_loop_niters (loop_vinfo);
9623   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9624   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9625   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9626   tree advance;
9627   drs_init_vec orig_drs_init;
9628
9629   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9630                               &step_vector, &niters_vector_mult_vf, th,
9631                               check_profitability, niters_no_overflow,
9632                               &advance);
9633
9634   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9635       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9636     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9637                             LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9638
9639   if (niters_vector == NULL_TREE)
9640     {
9641       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9642           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9643           && known_eq (lowest_vf, vf))
9644         {
9645           niters_vector
9646             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9647                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9648           step_vector = build_one_cst (TREE_TYPE (niters));
9649         }
9650       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9651         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9652                                      &step_vector, niters_no_overflow);
9653       else
9654         /* vect_do_peeling subtracted the number of peeled prologue
9655            iterations from LOOP_VINFO_NITERS.  */
9656         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9657                                      &niters_vector, &step_vector,
9658                                      niters_no_overflow);
9659     }
9660
9661   /* 1) Make sure the loop header has exactly two entries
9662      2) Make sure we have a preheader basic block.  */
9663
9664   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9665
9666   split_edge (loop_preheader_edge (loop));
9667
9668   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9669     /* This will deal with any possible peeling.  */
9670     vect_prepare_for_masked_peels (loop_vinfo);
9671
9672   /* Schedule the SLP instances first, then handle loop vectorization
9673      below.  */
9674   if (!loop_vinfo->slp_instances.is_empty ())
9675     {
9676       DUMP_VECT_SCOPE ("scheduling SLP instances");
9677       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9678     }
9679
9680   /* FORNOW: the vectorizer supports only loops which body consist
9681      of one basic block (header + empty latch). When the vectorizer will
9682      support more involved loop forms, the order by which the BBs are
9683      traversed need to be reconsidered.  */
9684
9685   for (i = 0; i < nbbs; i++)
9686     {
9687       basic_block bb = bbs[i];
9688       stmt_vec_info stmt_info;
9689
9690       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9691            gsi_next (&si))
9692         {
9693           gphi *phi = si.phi ();
9694           if (dump_enabled_p ())
9695             dump_printf_loc (MSG_NOTE, vect_location,
9696                              "------>vectorizing phi: %G", phi);
9697           stmt_info = loop_vinfo->lookup_stmt (phi);
9698           if (!stmt_info)
9699             continue;
9700
9701           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9702             vect_loop_kill_debug_uses (loop, stmt_info);
9703
9704           if (!STMT_VINFO_RELEVANT_P (stmt_info)
9705               && !STMT_VINFO_LIVE_P (stmt_info))
9706             continue;
9707
9708           if (STMT_VINFO_VECTYPE (stmt_info)
9709               && (maybe_ne
9710                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9711               && dump_enabled_p ())
9712             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9713
9714           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9715                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9716                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9717                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9718                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9719               && ! PURE_SLP_STMT (stmt_info))
9720             {
9721               if (dump_enabled_p ())
9722                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9723               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9724             }
9725         }
9726
9727       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9728            gsi_next (&si))
9729         {
9730           gphi *phi = si.phi ();
9731           stmt_info = loop_vinfo->lookup_stmt (phi);
9732           if (!stmt_info)
9733             continue;
9734
9735           if (!STMT_VINFO_RELEVANT_P (stmt_info)
9736               && !STMT_VINFO_LIVE_P (stmt_info))
9737             continue;
9738
9739           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9740                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9741                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9742                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9743                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9744               && ! PURE_SLP_STMT (stmt_info))
9745             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9746         }
9747
9748       for (gimple_stmt_iterator si = gsi_start_bb (bb);
9749            !gsi_end_p (si);)
9750         {
9751           stmt = gsi_stmt (si);
9752           /* During vectorization remove existing clobber stmts.  */
9753           if (gimple_clobber_p (stmt))
9754             {
9755               unlink_stmt_vdef (stmt);
9756               gsi_remove (&si, true);
9757               release_defs (stmt);
9758             }
9759           else
9760             {
9761               /* Ignore vector stmts created in the outer loop.  */
9762               stmt_info = loop_vinfo->lookup_stmt (stmt);
9763
9764               /* vector stmts created in the outer-loop during vectorization of
9765                  stmts in an inner-loop may not have a stmt_info, and do not
9766                  need to be vectorized.  */
9767               stmt_vec_info seen_store = NULL;
9768               if (stmt_info)
9769                 {
9770                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9771                     {
9772                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9773                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9774                            !gsi_end_p (subsi); gsi_next (&subsi))
9775                         {
9776                           stmt_vec_info pat_stmt_info
9777                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9778                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9779                                                     &si, &seen_store);
9780                         }
9781                       stmt_vec_info pat_stmt_info
9782                         = STMT_VINFO_RELATED_STMT (stmt_info);
9783                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9784                                                     &si, &seen_store))
9785                         maybe_set_vectorized_backedge_value (loop_vinfo,
9786                                                              pat_stmt_info);
9787                     }
9788                   else
9789                     {
9790                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9791                                                     &seen_store))
9792                         maybe_set_vectorized_backedge_value (loop_vinfo,
9793                                                              stmt_info);
9794                     }
9795                 }
9796               gsi_next (&si);
9797               if (seen_store)
9798                 {
9799                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9800                     /* Interleaving.  If IS_STORE is TRUE, the
9801                        vectorization of the interleaving chain was
9802                        completed - free all the stores in the chain.  */
9803                     vect_remove_stores (loop_vinfo,
9804                                         DR_GROUP_FIRST_ELEMENT (seen_store));
9805                   else
9806                     /* Free the attached stmt_vec_info and remove the stmt.  */
9807                     loop_vinfo->remove_stmt (stmt_info);
9808                 }
9809             }
9810         }
9811
9812       /* Stub out scalar statements that must not survive vectorization.
9813          Doing this here helps with grouped statements, or statements that
9814          are involved in patterns.  */
9815       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9816            !gsi_end_p (gsi); gsi_next (&gsi))
9817         {
9818           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9819           if (!call || !gimple_call_internal_p (call))
9820             continue;
9821           internal_fn ifn = gimple_call_internal_fn (call);
9822           if (ifn == IFN_MASK_LOAD)
9823             {
9824               tree lhs = gimple_get_lhs (call);
9825               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9826                 {
9827                   tree zero = build_zero_cst (TREE_TYPE (lhs));
9828                   gimple *new_stmt = gimple_build_assign (lhs, zero);
9829                   gsi_replace (&gsi, new_stmt, true);
9830                 }
9831             }
9832           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9833             {
9834               tree lhs = gimple_get_lhs (call);
9835               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9836                 {
9837                   tree else_arg
9838                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9839                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9840                   gsi_replace (&gsi, new_stmt, true);
9841                 }
9842             }
9843         }
9844     }                           /* BBs in loop */
9845
9846   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
9847      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
9848   if (integer_onep (step_vector))
9849     niters_no_overflow = true;
9850   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9851                            niters_vector_mult_vf, !niters_no_overflow);
9852
9853   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9854   scale_profile_for_vect_loop (loop, assumed_vf);
9855
9856   /* True if the final iteration might not handle a full vector's
9857      worth of scalar iterations.  */
9858   bool final_iter_may_be_partial
9859     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9860   /* The minimum number of iterations performed by the epilogue.  This
9861      is 1 when peeling for gaps because we always need a final scalar
9862      iteration.  */
9863   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9864   /* +1 to convert latch counts to loop iteration counts,
9865      -min_epilogue_iters to remove iterations that cannot be performed
9866        by the vector code.  */
9867   int bias_for_lowest = 1 - min_epilogue_iters;
9868   int bias_for_assumed = bias_for_lowest;
9869   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9870   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9871     {
9872       /* When the amount of peeling is known at compile time, the first
9873          iteration will have exactly alignment_npeels active elements.
9874          In the worst case it will have at least one.  */
9875       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9876       bias_for_lowest += lowest_vf - min_first_active;
9877       bias_for_assumed += assumed_vf - min_first_active;
9878     }
9879   /* In these calculations the "- 1" converts loop iteration counts
9880      back to latch counts.  */
9881   if (loop->any_upper_bound)
9882     {
9883       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9884       loop->nb_iterations_upper_bound
9885         = (final_iter_may_be_partial
9886            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9887                             lowest_vf) - 1
9888            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9889                              lowest_vf) - 1);
9890       if (main_vinfo)
9891         {
9892           unsigned int bound;
9893           poly_uint64 main_iters
9894             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
9895                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
9896           main_iters
9897             = upper_bound (main_iters,
9898                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
9899           if (can_div_away_from_zero_p (main_iters,
9900                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9901                                         &bound))
9902             loop->nb_iterations_upper_bound
9903               = wi::umin ((widest_int) (bound - 1),
9904                           loop->nb_iterations_upper_bound);
9905       }
9906   }
9907   if (loop->any_likely_upper_bound)
9908     loop->nb_iterations_likely_upper_bound
9909       = (final_iter_may_be_partial
9910          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9911                           + bias_for_lowest, lowest_vf) - 1
9912          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9913                            + bias_for_lowest, lowest_vf) - 1);
9914   if (loop->any_estimate)
9915     loop->nb_iterations_estimate
9916       = (final_iter_may_be_partial
9917          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9918                           assumed_vf) - 1
9919          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9920                            assumed_vf) - 1);
9921
9922   if (dump_enabled_p ())
9923     {
9924       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9925         {
9926           dump_printf_loc (MSG_NOTE, vect_location,
9927                            "LOOP VECTORIZED\n");
9928           if (loop->inner)
9929             dump_printf_loc (MSG_NOTE, vect_location,
9930                              "OUTER LOOP VECTORIZED\n");
9931           dump_printf (MSG_NOTE, "\n");
9932         }
9933       else
9934         dump_printf_loc (MSG_NOTE, vect_location,
9935                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9936                          GET_MODE_NAME (loop_vinfo->vector_mode));
9937     }
9938
9939   /* Loops vectorized with a variable factor won't benefit from
9940      unrolling/peeling.  */
9941   if (!vf.is_constant ())
9942     {
9943       loop->unroll = 1;
9944       if (dump_enabled_p ())
9945         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9946                          " variable-length vectorization factor\n");
9947     }
9948   /* Free SLP instances here because otherwise stmt reference counting
9949      won't work.  */
9950   slp_instance instance;
9951   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9952     vect_free_slp_instance (instance);
9953   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9954   /* Clear-up safelen field since its value is invalid after vectorization
9955      since vectorized loop can have loop-carried dependencies.  */
9956   loop->safelen = 0;
9957
9958   if (epilogue)
9959     {
9960       update_epilogue_loop_vinfo (epilogue, advance);
9961
9962       epilogue->simduid = loop->simduid;
9963       epilogue->force_vectorize = loop->force_vectorize;
9964       epilogue->dont_vectorize = false;
9965     }
9966
9967   return epilogue;
9968 }
9969
9970 /* The code below is trying to perform simple optimization - revert
9971    if-conversion for masked stores, i.e. if the mask of a store is zero
9972    do not perform it and all stored value producers also if possible.
9973    For example,
9974      for (i=0; i<n; i++)
9975        if (c[i])
9976         {
9977           p1[i] += 1;
9978           p2[i] = p3[i] +2;
9979         }
9980    this transformation will produce the following semi-hammock:
9981
9982    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9983      {
9984        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9985        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9986        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9987        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9988        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9989        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9990      }
9991 */
9992
9993 void
9994 optimize_mask_stores (class loop *loop)
9995 {
9996   basic_block *bbs = get_loop_body (loop);
9997   unsigned nbbs = loop->num_nodes;
9998   unsigned i;
9999   basic_block bb;
10000   class loop *bb_loop;
10001   gimple_stmt_iterator gsi;
10002   gimple *stmt;
10003   auto_vec<gimple *> worklist;
10004   auto_purge_vect_location sentinel;
10005
10006   vect_location = find_loop_location (loop);
10007   /* Pick up all masked stores in loop if any.  */
10008   for (i = 0; i < nbbs; i++)
10009     {
10010       bb = bbs[i];
10011       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10012            gsi_next (&gsi))
10013         {
10014           stmt = gsi_stmt (gsi);
10015           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
10016             worklist.safe_push (stmt);
10017         }
10018     }
10019
10020   free (bbs);
10021   if (worklist.is_empty ())
10022     return;
10023
10024   /* Loop has masked stores.  */
10025   while (!worklist.is_empty ())
10026     {
10027       gimple *last, *last_store;
10028       edge e, efalse;
10029       tree mask;
10030       basic_block store_bb, join_bb;
10031       gimple_stmt_iterator gsi_to;
10032       tree vdef, new_vdef;
10033       gphi *phi;
10034       tree vectype;
10035       tree zero;
10036
10037       last = worklist.pop ();
10038       mask = gimple_call_arg (last, 2);
10039       bb = gimple_bb (last);
10040       /* Create then_bb and if-then structure in CFG, then_bb belongs to
10041          the same loop as if_bb.  It could be different to LOOP when two
10042          level loop-nest is vectorized and mask_store belongs to the inner
10043          one.  */
10044       e = split_block (bb, last);
10045       bb_loop = bb->loop_father;
10046       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10047       join_bb = e->dest;
10048       store_bb = create_empty_bb (bb);
10049       add_bb_to_loop (store_bb, bb_loop);
10050       e->flags = EDGE_TRUE_VALUE;
10051       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10052       /* Put STORE_BB to likely part.  */
10053       efalse->probability = profile_probability::unlikely ();
10054       store_bb->count = efalse->count ();
10055       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10056       if (dom_info_available_p (CDI_DOMINATORS))
10057         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10058       if (dump_enabled_p ())
10059         dump_printf_loc (MSG_NOTE, vect_location,
10060                          "Create new block %d to sink mask stores.",
10061                          store_bb->index);
10062       /* Create vector comparison with boolean result.  */
10063       vectype = TREE_TYPE (mask);
10064       zero = build_zero_cst (vectype);
10065       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10066       gsi = gsi_last_bb (bb);
10067       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10068       /* Create new PHI node for vdef of the last masked store:
10069          .MEM_2 = VDEF <.MEM_1>
10070          will be converted to
10071          .MEM.3 = VDEF <.MEM_1>
10072          and new PHI node will be created in join bb
10073          .MEM_2 = PHI <.MEM_1, .MEM_3>
10074       */
10075       vdef = gimple_vdef (last);
10076       new_vdef = make_ssa_name (gimple_vop (cfun), last);
10077       gimple_set_vdef (last, new_vdef);
10078       phi = create_phi_node (vdef, join_bb);
10079       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10080
10081       /* Put all masked stores with the same mask to STORE_BB if possible.  */
10082       while (true)
10083         {
10084           gimple_stmt_iterator gsi_from;
10085           gimple *stmt1 = NULL;
10086
10087           /* Move masked store to STORE_BB.  */
10088           last_store = last;
10089           gsi = gsi_for_stmt (last);
10090           gsi_from = gsi;
10091           /* Shift GSI to the previous stmt for further traversal.  */
10092           gsi_prev (&gsi);
10093           gsi_to = gsi_start_bb (store_bb);
10094           gsi_move_before (&gsi_from, &gsi_to);
10095           /* Setup GSI_TO to the non-empty block start.  */
10096           gsi_to = gsi_start_bb (store_bb);
10097           if (dump_enabled_p ())
10098             dump_printf_loc (MSG_NOTE, vect_location,
10099                              "Move stmt to created bb\n%G", last);
10100           /* Move all stored value producers if possible.  */
10101           while (!gsi_end_p (gsi))
10102             {
10103               tree lhs;
10104               imm_use_iterator imm_iter;
10105               use_operand_p use_p;
10106               bool res;
10107
10108               /* Skip debug statements.  */
10109               if (is_gimple_debug (gsi_stmt (gsi)))
10110                 {
10111                   gsi_prev (&gsi);
10112                   continue;
10113                 }
10114               stmt1 = gsi_stmt (gsi);
10115               /* Do not consider statements writing to memory or having
10116                  volatile operand.  */
10117               if (gimple_vdef (stmt1)
10118                   || gimple_has_volatile_ops (stmt1))
10119                 break;
10120               gsi_from = gsi;
10121               gsi_prev (&gsi);
10122               lhs = gimple_get_lhs (stmt1);
10123               if (!lhs)
10124                 break;
10125
10126               /* LHS of vectorized stmt must be SSA_NAME.  */
10127               if (TREE_CODE (lhs) != SSA_NAME)
10128                 break;
10129
10130               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10131                 {
10132                   /* Remove dead scalar statement.  */
10133                   if (has_zero_uses (lhs))
10134                     {
10135                       gsi_remove (&gsi_from, true);
10136                       continue;
10137                     }
10138                 }
10139
10140               /* Check that LHS does not have uses outside of STORE_BB.  */
10141               res = true;
10142               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10143                 {
10144                   gimple *use_stmt;
10145                   use_stmt = USE_STMT (use_p);
10146                   if (is_gimple_debug (use_stmt))
10147                     continue;
10148                   if (gimple_bb (use_stmt) != store_bb)
10149                     {
10150                       res = false;
10151                       break;
10152                     }
10153                 }
10154               if (!res)
10155                 break;
10156
10157               if (gimple_vuse (stmt1)
10158                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
10159                 break;
10160
10161               /* Can move STMT1 to STORE_BB.  */
10162               if (dump_enabled_p ())
10163                 dump_printf_loc (MSG_NOTE, vect_location,
10164                                  "Move stmt to created bb\n%G", stmt1);
10165               gsi_move_before (&gsi_from, &gsi_to);
10166               /* Shift GSI_TO for further insertion.  */
10167               gsi_prev (&gsi_to);
10168             }
10169           /* Put other masked stores with the same mask to STORE_BB.  */
10170           if (worklist.is_empty ()
10171               || gimple_call_arg (worklist.last (), 2) != mask
10172               || worklist.last () != stmt1)
10173             break;
10174           last = worklist.pop ();
10175         }
10176       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10177     }
10178 }
10179
10180 /* Decide whether it is possible to use a zero-based induction variable
10181    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
10182    the value that the induction variable must be able to hold in order
10183    to ensure that the rgroups eventually have no active vector elements.
10184    Return -1 otherwise.  */
10185
10186 widest_int
10187 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10188 {
10189   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10190   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10191   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10192
10193   /* Calculate the value that the induction variable must be able
10194      to hit in order to ensure that we end the loop with an all-false mask.
10195      This involves adding the maximum number of inactive trailing scalar
10196      iterations.  */
10197   widest_int iv_limit = -1;
10198   if (max_loop_iterations (loop, &iv_limit))
10199     {
10200       if (niters_skip)
10201         {
10202           /* Add the maximum number of skipped iterations to the
10203              maximum iteration count.  */
10204           if (TREE_CODE (niters_skip) == INTEGER_CST)
10205             iv_limit += wi::to_widest (niters_skip);
10206           else
10207             iv_limit += max_vf - 1;
10208         }
10209       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10210         /* Make a conservatively-correct assumption.  */
10211         iv_limit += max_vf - 1;
10212
10213       /* IV_LIMIT is the maximum number of latch iterations, which is also
10214          the maximum in-range IV value.  Round this value down to the previous
10215          vector alignment boundary and then add an extra full iteration.  */
10216       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10217       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10218     }
10219   return iv_limit;
10220 }
10221
10222 /* For the given rgroup_controls RGC, check whether an induction variable
10223    would ever hit a value that produces a set of all-false masks or zero
10224    lengths before wrapping around.  Return true if it's possible to wrap
10225    around before hitting the desirable value, otherwise return false.  */
10226
10227 bool
10228 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10229 {
10230   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10231
10232   if (iv_limit == -1)
10233     return true;
10234
10235   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10236   unsigned int compare_precision = TYPE_PRECISION (compare_type);
10237   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10238
10239   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10240     return true;
10241
10242   return false;
10243 }